From 354fe5f52c94de1cd197e2a761934e8da814662c Mon Sep 17 00:00:00 2001 From: vedithal-amd Date: Fri, 25 Jul 2025 14:01:34 -0400 Subject: [PATCH] Unified configuration for metrics (#726) * Show description of metrics during analysis * Use --include-cols Description to show the Description column in analyze mode (this is hidden by default) * Remove tips field from analysis config * Align metric names in analysis config and documentation * Add unified config utils/unified_config.yaml * Add python script utils/split_config.py to auto generate analysis configuration and documentation metrics description * Add test case to ensure unified config is older than auto-generated config * Auto generate analysis config and documentation metrics description * Update CONTRIBUTING.md to add instructions to build documentation assets * Add docker image and compose file to build documentation * Update CHANGELOG and Documentation * Use jinja template instead of hardcoding metric tables in documentation [ROCm/rocprofiler-compute commit: bb44e90b2d15d2a662b65713c825fa85cf562fb0] --- projects/rocprofiler-compute/CHANGELOG.md | 4 + projects/rocprofiler-compute/CMakeLists.txt | 10 + projects/rocprofiler-compute/CONTRIBUTING.md | 4 + .../docker/docker-compose.doctest.yml | 5 - .../docs/_templates/metrics_table.j2 | 12 + .../docs/conceptual/command-processor.rst | 103 +- .../docs/conceptual/l2-cache.rst | 570 +- .../docs/conceptual/local-data-share.rst | 141 +- .../docs/conceptual/pipeline-metrics.rst | 699 +- .../docs/conceptual/shader-engine.rst | 495 +- .../docs/conceptual/system-speed-of-light.rst | 309 +- .../docs/conceptual/vector-l1-cache.rst | 493 +- projects/rocprofiler-compute/docs/conf.py | 119 +- .../docs/data/metrics_description.yaml | 4914 +++++ .../docs/how-to/analyze/cli.rst | 5 + .../docs/sphinx/requirements.in | 1 + .../docs/sphinx/requirements.txt | 8 +- projects/rocprofiler-compute/src/argparser.py | 15 +- projects/rocprofiler-compute/src/config.py | 3 +- 
.../rocprof_compute_analyze/analysis_base.py | 16 +- .../gfx908/0000_top_stats.yaml | 22 +- .../gfx908/0100_system_info.yaml | 11 +- .../gfx908/0200_system-speed-of-light.yaml | 236 - .../gfx908/0200_system_speed_of_light.yaml | 317 + .../gfx908/0300_mem_chart.yaml | 310 - .../gfx908/0300_memory_chart.yaml | 267 + .../gfx908/0400_roofline.yaml | 9 + .../gfx908/0500_command-processor.yaml | 135 - .../0500_command_processor_cpc_cpf.yaml | 145 + .../gfx908/0600_shader-processor-input.yaml | 167 - .../gfx908/0600_workgroup_manager_spi.yaml | 201 + .../gfx908/0700_wavefront-launch.yaml | 142 - .../gfx908/0700_wavefront.yaml | 173 + .../1000_compute-unit-instruction-mix.yaml | 129 - .../1000_compute_units_instruction_mix.yaml | 189 + .../1100_compute-unit-compute-pipeline.yaml | 84 - .../1100_compute_units_compute_pipeline.yaml | 147 + .../analysis_configs/gfx908/1200_lds.yaml | 118 - .../gfx908/1200_local_data_share_lds.yaml | 141 + .../gfx908/1300_instruction-cache.yaml | 105 - .../gfx908/1300_instruction_cache.yaml | 106 + .../gfx908/1400_constant-cache.yaml | 171 - .../gfx908/1400_scalar_l1_data_cache.yaml | 186 + .../gfx908/1500_TA_and_TD.yaml | 168 - ...ssing_unit_and_data_return_path_ta_td.yaml | 233 + .../gfx908/1600_L1_cache.yaml | 414 - .../gfx908/1600_vector_l1_data_cache.yaml | 442 + .../gfx908/1700_L2_cache.yaml | 388 - .../gfx908/1700_l2_cache.yaml | 536 + .../gfx908/1800_L2_cache_per_channel.yaml | 350 - .../gfx908/1800_l2_cache_per_channel.yaml | 323 + .../gfx908/2100_pc_sampling.yaml | 13 +- .../gfx90a/0000_top_stats.yaml | 22 +- .../gfx90a/0100_system_info.yaml | 11 +- .../gfx90a/0200_system-speed-of-light.yaml | 254 - .../gfx90a/0200_system_speed_of_light.yaml | 337 + .../gfx90a/0300_mem_chart.yaml | 315 - .../gfx90a/0300_memory_chart.yaml | 267 + .../gfx90a/0400_roofline.yaml | 9 + .../gfx90a/0400_roofline_info.yaml | 8 - .../gfx90a/0500_command-processor.yaml | 135 - .../0500_command_processor_cpc_cpf.yaml | 145 + 
.../gfx90a/0600_shader-processor-input.yaml | 167 - .../gfx90a/0600_workgroup_manager_spi.yaml | 201 + .../gfx90a/0700_wavefront-launch.yaml | 142 - .../gfx90a/0700_wavefront.yaml | 173 + .../1000_compute-unit-instruction-mix.yaml | 267 - .../1000_compute_units_instruction_mix.yaml | 304 + .../1100_compute-unit-compute-pipeline.yaml | 260 - .../1100_compute_units_compute_pipeline.yaml | 316 + .../analysis_configs/gfx90a/1200_lds.yaml | 118 - .../gfx90a/1200_local_data_share_lds.yaml | 141 + .../gfx90a/1300_instruction-cache.yaml | 105 - .../gfx90a/1300_instruction_cache.yaml | 106 + .../gfx90a/1400_constant-cache.yaml | 171 - .../gfx90a/1400_scalar_l1_data_cache.yaml | 186 + .../gfx90a/1500_TA_and_TD.yaml | 174 - ...ssing_unit_and_data_return_path_ta_td.yaml | 248 + .../gfx90a/1600_L1_cache.yaml | 414 - .../gfx90a/1600_vector_l1_data_cache.yaml | 442 + .../gfx90a/1700_L2_cache.yaml | 388 - .../gfx90a/1700_l2_cache.yaml | 536 + .../gfx90a/1800_L2_cache_per_channel.yaml | 350 - .../gfx90a/1800_l2_cache_per_channel.yaml | 323 + .../gfx90a/2100_pc_sampling.yaml | 13 +- .../gfx940/0000_top_stats.yaml | 22 +- .../gfx940/0100_system_info.yaml | 11 +- .../gfx940/0200_system-speed-of-light.yaml | 262 - .../gfx940/0200_system_speed_of_light.yaml | 346 + .../gfx940/0300_mem_chart.yaml | 315 - .../gfx940/0300_memory_chart.yaml | 263 + .../gfx940/0400_roofline.yaml | 9 + .../gfx940/0400_roofline_info.yaml | 8 - .../gfx940/0500_command-processor.yaml | 135 - .../0500_command_processor_cpc_cpf.yaml | 145 + .../gfx940/0600_shader-processor-input.yaml | 167 - .../gfx940/0600_workgroup_manager_spi.yaml | 201 + .../gfx940/0700_wavefront-launch.yaml | 142 - .../gfx940/0700_wavefront.yaml | 173 + .../1000_compute-unit-instruction-mix.yaml | 277 - .../1000_compute_units_instruction_mix.yaml | 309 + .../1100_compute-unit-compute-pipeline.yaml | 273 - .../1100_compute_units_compute_pipeline.yaml | 330 + .../analysis_configs/gfx940/1200_lds.yaml | 118 - 
.../gfx940/1200_local_data_share_lds.yaml | 141 + .../gfx940/1300_instruction-cache.yaml | 105 - .../gfx940/1300_instruction_cache.yaml | 106 + .../gfx940/1400_constant-cache.yaml | 171 - .../gfx940/1400_scalar_l1_data_cache.yaml | 186 + .../gfx940/1500_TA_and_TD.yaml | 174 - ...ssing_unit_and_data_return_path_ta_td.yaml | 248 + .../gfx940/1600_L1_cache.yaml | 387 - .../gfx940/1600_vector_l1_data_cache.yaml | 412 + .../gfx940/1700_L2_cache.yaml | 391 - .../gfx940/1700_l2_cache.yaml | 536 + .../gfx940/1800_L2_cache_per_channel.yaml | 298 - .../gfx940/1800_l2_cache_per_channel.yaml | 251 + .../gfx940/2100_pc_sampling.yaml | 13 +- .../gfx941/0000_top_stats.yaml | 22 +- .../gfx941/0100_system_info.yaml | 11 +- .../gfx941/0200_system-speed-of-light.yaml | 262 - .../gfx941/0200_system_speed_of_light.yaml | 346 + .../gfx941/0300_mem_chart.yaml | 315 - .../gfx941/0300_memory_chart.yaml | 263 + .../gfx941/0400_roofline.yaml | 9 + .../gfx941/0400_roofline_info.yaml | 8 - .../gfx941/0500_command-processor.yaml | 135 - .../0500_command_processor_cpc_cpf.yaml | 145 + .../gfx941/0600_shader-processor-input.yaml | 167 - .../gfx941/0600_workgroup_manager_spi.yaml | 201 + .../gfx941/0700_wavefront-launch.yaml | 142 - .../gfx941/0700_wavefront.yaml | 173 + .../1000_compute-unit-instruction-mix.yaml | 277 - .../1000_compute_units_instruction_mix.yaml | 309 + .../1100_compute-unit-compute-pipeline.yaml | 273 - .../1100_compute_units_compute_pipeline.yaml | 330 + .../analysis_configs/gfx941/1200_lds.yaml | 118 - .../gfx941/1200_local_data_share_lds.yaml | 141 + .../gfx941/1300_instruction-cache.yaml | 105 - .../gfx941/1300_instruction_cache.yaml | 106 + .../gfx941/1400_constant-cache.yaml | 171 - .../gfx941/1400_scalar_l1_data_cache.yaml | 186 + .../gfx941/1500_TA_and_TD.yaml | 174 - ...ssing_unit_and_data_return_path_ta_td.yaml | 248 + .../gfx941/1600_L1_cache.yaml | 387 - .../gfx941/1600_vector_l1_data_cache.yaml | 412 + .../gfx941/1700_L2_cache.yaml | 391 - 
.../gfx941/1700_l2_cache.yaml | 536 + .../gfx941/1800_L2_cache_per_channel.yaml | 298 - .../gfx941/1800_l2_cache_per_channel.yaml | 251 + .../gfx941/2100_pc_sampling.yaml | 13 +- .../gfx942/0000_top_stats.yaml | 22 +- .../gfx942/0100_system_info.yaml | 11 +- .../gfx942/0200_system-speed-of-light.yaml | 262 - .../gfx942/0200_system_speed_of_light.yaml | 346 + .../gfx942/0300_mem_chart.yaml | 316 - .../gfx942/0300_memory_chart.yaml | 263 + .../gfx942/0400_roofline.yaml | 9 + .../gfx942/0400_roofline_info.yaml | 8 - .../gfx942/0500_command-processor.yaml | 135 - .../0500_command_processor_cpc_cpf.yaml | 145 + .../gfx942/0600_shader-processor-input.yaml | 167 - .../gfx942/0600_workgroup_manager_spi.yaml | 201 + .../gfx942/0700_wavefront-launch.yaml | 142 - .../gfx942/0700_wavefront.yaml | 173 + .../1000_compute-unit-instruction-mix.yaml | 277 - .../1000_compute_units_instruction_mix.yaml | 309 + .../1100_compute-unit-compute-pipeline.yaml | 273 - .../1100_compute_units_compute_pipeline.yaml | 330 + .../analysis_configs/gfx942/1200_lds.yaml | 119 - .../gfx942/1200_local_data_share_lds.yaml | 141 + .../gfx942/1300_instruction-cache.yaml | 106 - .../gfx942/1300_instruction_cache.yaml | 106 + .../gfx942/1400_constant-cache.yaml | 172 - .../gfx942/1400_scalar_l1_data_cache.yaml | 186 + .../gfx942/1500_TA_and_TD.yaml | 174 - ...ssing_unit_and_data_return_path_ta_td.yaml | 248 + .../gfx942/1600_L1_cache.yaml | 389 - .../gfx942/1600_vector_l1_data_cache.yaml | 412 + .../gfx942/1700_L2_cache.yaml | 401 - .../gfx942/1700_l2_cache.yaml | 545 + .../gfx942/1800_L2_cache_per_channel.yaml | 308 - .../gfx942/1800_l2_cache_per_channel.yaml | 251 + .../gfx942/2100_pc_sampling.yaml | 13 +- .../gfx950/0000_top_stats.yaml | 22 +- .../gfx950/0100_system_info.yaml | 11 +- .../gfx950/0200_system-speed-of-light.yaml | 269 - .../gfx950/0200_system_speed_of_light.yaml | 352 + .../gfx950/0300_mem_chart.yaml | 315 - .../gfx950/0300_memory_chart.yaml | 269 + .../gfx950/0400_roofline.yaml | 9 + 
.../gfx950/0400_roofline_info.yaml | 8 - .../gfx950/0500_command-processor.yaml | 153 - .../0500_command_processor_cpc_cpf.yaml | 166 + .../gfx950/0600_shader-processor-input.yaml | 188 - .../gfx950/0600_workgroup_manager_spi.yaml | 237 + .../gfx950/0700_wavefront-launch.yaml | 142 - .../gfx950/0700_wavefront.yaml | 173 + .../1000_compute-unit-instruction-mix.yaml | 289 - .../1000_compute_units_instruction_mix.yaml | 319 + .../1100_compute-unit-compute-pipeline.yaml | 293 - .../1100_compute_units_compute_pipeline.yaml | 346 + .../analysis_configs/gfx950/1200_lds.yaml | 166 - .../gfx950/1200_local_data_share_lds.yaml | 181 + .../gfx950/1300_instruction-cache.yaml | 105 - .../gfx950/1300_instruction_cache.yaml | 106 + .../gfx950/1400_constant-cache.yaml | 171 - .../gfx950/1400_scalar_l1_data_cache.yaml | 186 + .../gfx950/1500_TA_and_TD.yaml | 210 - ...ssing_unit_and_data_return_path_ta_td.yaml | 263 + .../gfx950/1600_L1_cache.yaml | 482 - .../gfx950/1600_vector_l1_data_cache.yaml | 507 + .../gfx950/1700_L2_cache.yaml | 553 - .../gfx950/1700_l2_cache.yaml | 695 + .../gfx950/1800_L2_cache_per_channel.yaml | 298 - .../gfx950/1800_l2_cache_per_channel.yaml | 257 + .../gfx950/2100_pc_sampling.yaml | 13 +- .../panel_config_template.yaml | 5 +- projects/rocprofiler-compute/src/utils/gui.py | 16 +- .../src/utils/gui_components/memchart.py | 13 +- .../src/utils/mem_chart.py | 24 +- .../src/utils/mi_gpu_spec.py | 4 +- .../rocprofiler-compute/src/utils/parser.py | 36 +- projects/rocprofiler-compute/src/utils/tty.py | 33 +- .../tests/test_TCP_counters.py | 2 +- .../tests/test_analyze_commands.py | 28 +- .../tests/test_autogen_config.py | 43 + .../tests/test_profile_general.py | 1 - .../rocprofiler-compute/tests/test_utils.py | 2 +- .../utils/autogen_hash.yaml | 110 + .../rocprofiler-compute/utils/split_config.py | 160 + .../utils/unified_config.yaml | 16496 ++++++++++++++++ 232 files changed, 44409 insertions(+), 22480 deletions(-) create mode 100644 
projects/rocprofiler-compute/docs/_templates/metrics_table.j2 create mode 100644 projects/rocprofiler-compute/docs/data/metrics_description.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system-speed-of-light.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_mem_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command-processor.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_shader-processor-input.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront-launch.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute-unit-compute-pipeline.yaml create mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_lds.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_L2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml create mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_mem_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline_info.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command-processor.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_lds.yaml create mode 
100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_L2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_mem_chart.yaml create mode 
100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline_info.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command-processor.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_shader-processor-input.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront-launch.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_lds.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction-cache.yaml 
create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_L2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_mem_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml delete mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline_info.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command-processor.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_lds.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml delete mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_mem_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline_info.yaml delete mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command-processor.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_shader-processor-input.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront-launch.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_lds.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml create mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_L2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system-speed-of-light.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline_info.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command-processor.yaml create mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_shader-processor-input.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront-launch.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute-unit-compute-pipeline.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_lds.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_constant-cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml delete mode 100644 
projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_TA_and_TD.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml delete mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml create mode 100644 projects/rocprofiler-compute/tests/test_autogen_config.py create mode 100644 projects/rocprofiler-compute/utils/autogen_hash.yaml create mode 100644 projects/rocprofiler-compute/utils/split_config.py create mode 100644 projects/rocprofiler-compute/utils/unified_config.yaml diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index e85ebab229..62c4f55156 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -66,6 +66,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Add deprecation warning for database update mode. 
+* Show description of metrics during analysis + * Use `--include-cols Description` to show the `Description` column, which is excluded by default from CLI output + ### Changed * Change the default rocprof version to rocprofv3, this is used when environment variable "ROCPROF" is not set @@ -101,6 +104,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Fixed not detecting memory clock issue when using amd-smi * Fixed standalone GUI crashing * Fixed L2 read/write/atomic bandwidths on MI350 +* Update metric names for better alignment between analysis configuration and documentation ### Known issues diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index 98b59131a3..2954e9b29c 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -335,6 +335,16 @@ add_test( ${PROJECT_SOURCE_DIR}/tests/test_utils.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +# ----------------------------------- +# Autogenerated configuration tests +# ----------------------------------- + +add_test( + NAME test_autogen_config + COMMAND ${Python3_EXECUTABLE} -m pytest --junitxml=tests/test_autogen_config.xml + ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_autogen_config.py + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + # --------- # Install # --------- diff --git a/projects/rocprofiler-compute/CONTRIBUTING.md b/projects/rocprofiler-compute/CONTRIBUTING.md index de7f57b890..c733522505 100644 --- a/projects/rocprofiler-compute/CONTRIBUTING.md +++ b/projects/rocprofiler-compute/CONTRIBUTING.md @@ -57,3 +57,7 @@ Please see the [pre-commit documentation](https://pre-commit.com/#quick-start) f Below are some repository specific guidelines which are followed througout the repository. Any future contributions should adhere to these guidelines: * Use the `pathlib` library functions instead of `os.path` for manipulating the file paths. 
+ +## Build and test documentation changes + +For instructions on how to build and test documentation changes (files under docs folder), please see https://rocm.docs.amd.com/en/latest/contribute/contributing.html diff --git a/projects/rocprofiler-compute/docker/docker-compose.doctest.yml b/projects/rocprofiler-compute/docker/docker-compose.doctest.yml index 791ad512bb..3ccd1b76a8 100644 --- a/projects/rocprofiler-compute/docker/docker-compose.doctest.yml +++ b/projects/rocprofiler-compute/docker/docker-compose.doctest.yml @@ -3,11 +3,6 @@ services: build: context: ../ dockerfile: docker/Dockerfile.doctest - devices: - - /dev/kfd - - /dev/dri - security_opt: - - seccomp:unconfined volumes: - ../:/app tty: true diff --git a/projects/rocprofiler-compute/docs/_templates/metrics_table.j2 b/projects/rocprofiler-compute/docs/_templates/metrics_table.j2 new file mode 100644 index 0000000000..0521228d65 --- /dev/null +++ b/projects/rocprofiler-compute/docs/_templates/metrics_table.j2 @@ -0,0 +1,12 @@ +.. list-table:: + :header-rows: 1 + + * - Metric + - Description + - Unit + + {% for metric, metric_info in data.items() %} + * - {{ metric }} + - {{ metric_info.rst }} + - {{ metric_info.unit }} + {% endfor %} \ No newline at end of file diff --git a/projects/rocprofiler-compute/docs/conceptual/command-processor.rst b/projects/rocprofiler-compute/docs/conceptual/command-processor.rst index 873c8a3a68..8300625aae 100644 --- a/projects/rocprofiler-compute/docs/conceptual/command-processor.rst +++ b/projects/rocprofiler-compute/docs/conceptual/command-processor.rst @@ -46,108 +46,13 @@ processor’s metrics therefore are focused on reporting, for example: Command processor fetcher (CPF) =============================== -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - CPF Utilization - - - Percent of total cycles where the CPF was busy actively doing any work. - The ratio of CPF busy cycles over total cycles counted by the CPF. 
- - - Percent - - * - CPF Stall - - - Percent of CPF busy cycles where the CPF was stalled for any reason. - - - Percent - - * - CPF-L2 Utilization - - - Percent of total cycles counted by the CPF-:doc:`L2 ` interface - where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 - busy cycles over total cycles counted by the CPF-L2. - - - Percent - - * - CPF-L2 Stall - - - Percent of CPF-:doc:`L2 ` L2 busy cycles where the CPF-L2 - interface was stalled for any reason. - - - Percent - - * - CPF-UTCL1 Stall - - - Percent of CPF busy cycles where the CPF was stalled by address - translation. - - - Percent +.. jinja:: cpf-metrics + :file: _templates/metrics_table.j2 .. _cpc-metrics: Command processor packet processor (CPC) ======================================== -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - CPC Utilization - - - Percent of total cycles where the CPC was busy actively doing any work. - The ratio of CPC busy cycles over total cycles counted by the CPC. - - - Percent - - * - CPC Stall - - - Percent of CPC busy cycles where the CPC was stalled for any reason. - - - Percent - - * - CPC Packet Decoding Utilization - - - Percent of CPC busy cycles spent decoding commands for processing. - - - Percent - - * - CPC-Workgroup Manager Utilization - - - Percent of CPC busy cycles spent dispatching workgroups to the - :ref:`workgroup manager `. - - - Percent - - * - CPC-L2 Utilization - - - Percent of total cycles counted by the CPC-:doc:`L2 ` interface - where the CPC-L2 interface was active doing any work. - - - Percent - - * - CPC-UTCL1 Stall - - - Percent of CPC busy cycles where the CPC was stalled by address - translation. - - - Percent - - * - CPC-UTCL2 Utilization - - - Percent of total cycles counted by the CPC's :doc:`L2 ` address - translation interface where the CPC was busy doing address translation - work. - - - Percent +.. 
jinja:: cpc-metrics + :file: _templates/metrics_table.j2 diff --git a/projects/rocprofiler-compute/docs/conceptual/l2-cache.rst b/projects/rocprofiler-compute/docs/conceptual/l2-cache.rst index b9752f1baa..fabb56d027 100644 --- a/projects/rocprofiler-compute/docs/conceptual/l2-cache.rst +++ b/projects/rocprofiler-compute/docs/conceptual/l2-cache.rst @@ -48,56 +48,8 @@ The L2 cache’s speed-of-light table contains a few key metrics about the performance of the L2 cache, aggregated over all the L2 channels, as a comparison with the peak achievable values of those metrics: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Utilization - - - The ratio of the - :ref:`number of cycles an L2 channel was active, summed over all L2 channels on the accelerator ` - over the :ref:`total L2 cycles `. - - - Percent - - * - Bandwidth - - - The number of bytes looked up in the L2 cache, as a percent of the peak - theoretical bandwidth achievable on the specific accelerator. The number - of bytes is calculated as the number of cache lines requested multiplied - by the cache line size. This value does not consider partial requests, so - e.g., if only a single value is requested in a cache line, the data - movement will still be counted as a full cache line. - - - Percent - - * - Hit Rate - - - The ratio of the number of L2 cache line requests that hit in the L2 - cache over the total number of incoming cache line requests to the L2 - cache. - - - Percent - - * - L2-Fabric Read BW - - - The number of bytes read by the L2 over the - :ref:`Infinity Fabric interface ` per unit time. - - - GB/s - - * - L2-Fabric Write and Atomic BW - - - The number of bytes sent by the L2 over the - :ref:`Infinity Fabric interface ` by write and atomic - operations per unit time. - - - GB/s +.. jinja:: l2-sol + :file: _templates/metrics_table.j2 .. 
note:: @@ -117,168 +69,8 @@ This section details the incoming requests to the L2 cache from the :doc:`vL1D ` and other clients -- for instance, the :ref:`sL1D ` and :ref:`L1I ` caches. -.. list-table:: - :header-rows: 1 - :widths: 13 70 17 - - * - Metric - - - Description - - - Unit - - * - Bandwidth - - - The number of bytes looked up in the L2 cache, per - :ref:`normalization unit `. The number of bytes is - calculated as the number of cache lines requested multiplied by the cache - line size. This value does not consider partial requests, so for example, - if only a single value is requested in a cache line, the data movement - will still be counted as a full cache line. - - - Bytes per :ref:`normalization unit `. - - * - Requests - - - The total number of incoming requests to the L2 from all clients for all - request types, per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit `. - - * - Read Requests - - - The total number of read requests to the L2 from all clients. - - - Requests per :ref:`normalization unit ` - - * - Write Requests - - - The total number of write requests to the L2 from all clients. - - - Requests per :ref:`normalization unit ` - - * - Atomic Requests - - - The total number of atomic requests (with and without return) to the L2 - from all clients. - - - Requests per :ref:`normalization unit ` - - * - Streaming Requests - - - The total number of incoming requests to the L2 that are marked as - *streaming*. The exact meaning of this may differ depending on the - targeted accelerator, however on an :ref:`MI2XX ` this - corresponds to - `non-temporal load or stores `_. - The L2 cache attempts to evict *streaming* requests before normal - requests when the L2 is at capacity. - - - Requests per :ref:`normalization unit ` - - * - Probe Requests - - - The number of coherence probe requests made to the L2 cache from outside - the accelerator. 
On an :ref:`MI2XX `, probe requests may be - generated by, for example, writes to - :ref:`fine-grained device ` memory or by writes to - :ref:`coarse-grained ` device memory. - - - Requests per :ref:`normalization unit ` - - * - Hit Rate - - - The ratio of the number of L2 cache line requests that hit in the L2 - cache over the total number of incoming cache line requests to the L2 - cache. - - - Percent - - * - Hits - - - The total number of requests to the L2 from all clients that hit in the - cache. As noted in the :ref:`Speed-of-Light ` section, this - includes hit-on-miss requests. - - - Requests per :ref:`normalization unit ` - - * - Misses - - - The total number of requests to the L2 from all clients that miss in the - cache. As noted in the :ref:`Speed-of-Light ` section, these do - not include hit-on-miss requests. - - - Requests per :ref:`normalization unit ` - - * - Writebacks - - - The total number of L2 cache lines written back to memory for any reason. - Write-backs may occur due to user code (such as HIP kernel calls to - ``__threadfence_system`` or atomic built-ins) by the - :doc:`command processor `'s memory acquire/release - fences, or for other internal hardware reasons. - - - Cache lines per :ref:`normalization unit ` - - * - Writebacks (Internal) - - - The total number of L2 cache lines written back to memory for internal - hardware reasons, per :ref:`normalization unit `. - - - Cache lines per :ref:`normalization unit `. - - * - Writebacks (vL1D Req) - - - The total number of L2 cache lines written back to memory due to requests - initiated by the :doc:`vL1D cache `, per - :ref:`normalization unit `. - - - Cache lines per :ref:`normalization unit `. - - * - Evictions (Normal) - - - The total number of L2 cache lines evicted from the cache due to capacity - limits, per :ref:`normalization unit `. - - - Cache lines per :ref:`normalization unit `. 
- - * - Evictions (vL1D Req) - - The total number of L2 cache lines evicted from the cache due to - invalidation requests initiated by the - :doc:`vL1D cache `, per - :ref:`normalization unit `. - - Cache lines per :ref:`normalization unit `. - - * - Non-hardware-Coherent Requests - - The total number of requests to the L2 to Not-hardware-Coherent (NC) - memory allocations, per :ref:`normalization unit `. - See the :ref:`memory-type` for more information. - - Requests per :ref:`normalization unit `. - - * - Uncached Requests - - The total number of requests to the L2 that go to Uncached (UC) memory - allocations. See the :ref:`memory-type` for more information. - - Requests per :ref:`normalization unit `. - - * - Coherently Cached Requests - - The total number of requests to the L2 that go to Coherently Cacheable (CC) - memory allocations. See the :ref:`memory-type` for more information. - - Requests per :ref:`normalization unit `. - - * - Read/Write Coherent Requests - - The total number of requests to the L2 that go to Read-Write coherent memory - (RW) allocations. See the :ref:`memory-type` for more information. - - Requests per :ref:`normalization unit `. +.. jinja:: l2-cache-accesses + :file: _templates/metrics_table.j2 .. note:: @@ -300,7 +92,7 @@ is responsible for routing these memory requests/data to the correct location and returning any fetched data to the L2 cache. The :ref:`l2-request-flow` describes the flow of these requests through Infinity Fabric in more detail, as described by ROCm Compute Profiler metrics, -while :ref:`l2-request-metrics` give detailed definitions of +while :ref:`l2-fabric-metrics` give detailed definitions of individual metrics. .. _l2-request-flow: @@ -363,176 +155,15 @@ to uncached memory (denoted by the dashed line), they will also be counted as *two* uncached read requests (that is, the request is split). -.. _l2-request-metrics: +.. 
_l2-fabric-metrics: Metrics ------- The following metrics are reported for the L2-Fabric interface: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - L2-Fabric Read Bandwidth - - - The total number of bytes read by the L2 cache from Infinity Fabric per - :ref:`normalization unit `. - - - Bytes per :ref:`normalization unit `. - - * - HBM Read Traffic - - - The percent of read requests generated by the L2 cache that are routed to - the accelerator's local high-bandwidth memory (HBM). This breakdown does - not consider the *size* of the request (meaning that 32B and 64B requests - are both counted as a single request), so this metric only *approximates* - the percent of the L2-Fabric Read bandwidth directed to the local HBM. - - - Percent - - * - Remote Read Traffic - - - The percent of read requests generated by the L2 cache that are routed to - any memory location other than the accelerator's local high-bandwidth - memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's - HBM. This breakdown does not consider the *size* of the request (meaning - that 32B and 64B requests are both counted as a single request), so this - metric only *approximates* the percent of the L2-Fabric Read bandwidth - directed to a remote location. - - - Percent - - * - Uncached Read Traffic - - - The percent of read requests generated by the L2 cache that are reading - from an :ref:`uncached memory allocation `. Note, as - described in the :ref:`request flow ` section, a single - 64B read request is typically counted as two uncached read requests. So, - it is possible for the Uncached Read Traffic to reach up to 200% of the - total number of read requests. This breakdown does not consider the - *size* of the request (i.e., 32B and 64B requests are both counted as a - single request), so this metric only *approximates* the percent of the - L2-Fabric read bandwidth directed to an uncached memory location. 
- - - Percent - - * - L2-Fabric Write and Atomic Bandwidth - - - The total number of bytes written by the L2 over Infinity Fabric by write - and atomic operations per - :ref:`normalization unit `. Note that on current - CDNA accelerators, such as the :ref:`MI2XX `, requests are - only considered *atomic* by Infinity Fabric if they are targeted at - non-write-cacheable memory, for example, - :ref:`fine-grained memory ` allocations or - :ref:`uncached memory ` allocations on the - MI2XX. - - - Bytes per :ref:`normalization unit `. - - * - HBM Write and Atomic Traffic - - - The percent of write and atomic requests generated by the L2 cache that - are routed to the accelerator's local high-bandwidth memory (HBM). This - breakdown does not consider the *size* of the request (meaning that 32B - and 64B requests are both counted as a single request), so this metric - only *approximates* the percent of the L2-Fabric Write and Atomic - bandwidth directed to the local HBM. Note that on current CDNA - accelerators, such as the :ref:`MI2XX `, requests are only - considered *atomic* by Infinity Fabric if they are targeted at - :ref:`fine-grained memory ` allocations or - :ref:`uncached memory ` allocations. - - - Percent - - * - Remote Write and Atomic Traffic - - - The percent of read requests generated by the L2 cache that are routed to - any memory location other than the accelerator's local high-bandwidth - memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's - HBM. This breakdown does not consider the *size* of the request (meaning - that 32B and 64B requests are both counted as a single request), so this - metric only *approximates* the percent of the L2-Fabric Read bandwidth - directed to a remote location. Note that on current CDNA - accelerators, such as the :ref:`MI2XX `, requests are only - considered *atomic* by Infinity Fabric if they are targeted at - :ref:`fine-grained memory ` allocations or - :ref:`uncached memory ` allocations. 
- - - Percent - - * - Atomic Traffic - - - The percent of write requests generated by the L2 cache that are atomic - requests to *any* memory location. This breakdown does not consider the - *size* of the request (meaning that 32B and 64B requests are both counted - as a single request), so this metric only *approximates* the percent of - the L2-Fabric Read bandwidth directed to a remote location. Note that on - current CDNA accelerators, such as the :ref:`MI2XX `, - requests are only considered *atomic* by Infinity Fabric if they are - targeted at :ref:`fine-grained memory ` allocations or - :ref:`uncached memory ` allocations. - - - Percent - - * - Uncached Write and Atomic Traffic - - - The percent of write and atomic requests generated by the L2 cache that - are targeting :ref:`uncached memory allocations `. This - breakdown does not consider the *size* of the request (meaning that 32B - and 64B requests are both counted as a single request), so this metric - only *approximates* the percent of the L2-Fabric read bandwidth directed - to uncached memory allocations. - - - Percent - - * - Read Latency - - - The time-averaged number of cycles read requests spent in Infinity Fabric - before data was returned to the L2. - - - Cycles - - * - Write Latency - - - The time-averaged number of cycles write requests spent in Infinity - Fabric before a completion acknowledgement was returned to the L2. - - - Cycles - - * - Atomic Latency - - - The time-averaged number of cycles atomic requests spent in Infinity - Fabric before a completion acknowledgement (atomic without return value) - or data (atomic with return value) was returned to the L2. - - - Cycles - - * - Read Stall - - - The ratio of the total number of cycles the L2-Fabric interface was - stalled on a read request to any destination (local HBM, remote PCIe® - connected accelerator or CPU, or remote Infinity Fabric connected - accelerator [#inf]_ or CPU) over the - :ref:`total active L2 cycles `. 
- - - Percent - - * - Write Stall - - - The ratio of the total number of cycles the L2-Fabric interface was - stalled on a write or atomic request to any destination (local HBM, - remote accelerator or CPU, PCIe connected accelerator or CPU, or remote - Infinity Fabric connected accelerator [#inf]_ or CPU) over the - :ref:`total active L2 cycles `. - - - Percent +.. jinja:: l2-fabric-metrics + :file: _templates/metrics_table.j2 .. _l2-detailed-metrics: @@ -542,121 +173,8 @@ Detailed transaction metrics The following metrics are available in the detailed L2-Fabric transaction breakdown table: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - 32B Read Requests - - - The total number of L2 requests to Infinity Fabric to read 32B of data - from any memory location, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. Typically unused on CDNA - accelerators. - - - Requests per :ref:`normalization unit `. - - * - Uncached Read Requests - - - The total number of L2 requests to Infinity Fabric to read - :ref:`uncached data ` from any memory location, per - :ref:`normalization unit `. 64B requests for - uncached data are counted as two 32B uncached data requests. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. - - * - 64B Read Requests - - - The total number of L2 requests to Infinity Fabric to read 64B of data - from any memory location, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. - - * - HBM Read Requests - - - The total number of L2 requests to Infinity Fabric to read 32B or 64B of - data from the accelerator's local HBM, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. 
- - * - Remote Read Requests - - - The total number of L2 requests to Infinity Fabric to read 32B or 64B of - data from any source other than the accelerator's local HBM, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. - - * - 32B Write and Atomic Requests - - - The total number of L2 requests to Infinity Fabric to write or atomically - update 32B of data to any memory location, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. - - * - Uncached Write and Atomic Requests - - - The total number of L2 requests to Infinity Fabric to write or atomically - update 32B or 64B of :ref:`uncached data `, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. - - * - 64B Write and Atomic Requests - - - The total number of L2 requests to Infinity Fabric to write or atomically - update 64B of data in any memory location, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. - - * - HBM Write and Atomic Requests - - - The total number of L2 requests to Infinity Fabric to write or atomically - update 32B or 64B of data in the accelerator's local HBM, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. - - * - Remote Write and Atomic Requests - - - The total number of L2 requests to Infinity Fabric to write or atomically - update 32B or 64B of data in any memory location other than the - accelerator's local HBM, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. - - - Requests per :ref:`normalization unit `. 
- - * - Atomic Requests - - - The total number of L2 requests to Infinity Fabric to atomically update - 32B or 64B of data in any memory location, per - :ref:`normalization unit `. See - :ref:`l2-request-flow` for more detail. Note that on current CDNA - accelerators, such as the :ref:`MI2XX `, requests are only - considered *atomic* by Infinity Fabric if they are targeted at - non-write-cacheable memory, such as - :ref:`fine-grained memory ` allocations or - :ref:`uncached memory ` allocations on the MI2XX. - - - Requests per :ref:`normalization unit `. +.. jinja:: l2-detailed-metrics + :file: _templates/metrics_table.j2 .. _l2-fabric-stalls: @@ -670,72 +188,8 @@ what types of requests in a kernel caused a stall (like read versus write), and to which locations -- for instance, to the accelerator’s local memory, or to remote accelerators or CPUs. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Read - PCIe Stall - - - The number of cycles the L2-Fabric interface was stalled on read requests - to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the - :ref:`total active L2 cycles `. - - - Percent - - * - Read - Infinity Fabric Stall - - - The number of cycles the L2-Fabric interface was stalled on read requests - to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a - percent of the :ref:`total active L2 cycles `. - - - Percent - - * - Read - HBM Stall - - - The number of cycles the L2-Fabric interface was stalled on read requests - to the accelerator's local HBM as a percent of the - :ref:`total active L2 cycles `. - - - Percent - - * - Write - PCIe Stall - - - The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to remote PCIe connected accelerators [#inf]_ or CPUs as - a percent of the :ref:`total active L2 cycles `. 
- - - Percent - - * - Write - Infinity Fabric Stall - - - The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to remote Infinity Fabric connected accelerators [#inf]_ - or CPUs as a percent of the - :ref:`total active L2 cycles `. - - - Percent - - * - Write - HBM Stall - - - The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to accelerator's local HBM as a percent of the - :ref:`total active L2 cycles `. - - - Percent - - * - Write - Credit Starvation - - - The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to any memory location because too many write/atomic - requests were currently in flight, as a percent of the - :ref:`total active L2 cycles `. - - - Percent +.. jinja:: l2-fabric-stalls + :file: _templates/metrics_table.j2 .. warning:: diff --git a/projects/rocprofiler-compute/docs/conceptual/local-data-share.rst b/projects/rocprofiler-compute/docs/conceptual/local-data-share.rst index 121384de42..9d991b04fc 100644 --- a/projects/rocprofiler-compute/docs/conceptual/local-data-share.rst +++ b/projects/rocprofiler-compute/docs/conceptual/local-data-share.rst @@ -21,53 +21,8 @@ LDS Speed-of-Light The :ref:`LDS ` speed-of-light chart shows a number of key metrics for the LDS as a comparison with the peak achievable values of those metrics. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Utilization - - - Indicates what percent of the kernel's duration the :ref:`LDS ` - was actively executing instructions (including, but not limited to, load, - store, atomic and HIP's ``__shfl`` operations). Calculated as the ratio - of the total number of cycles LDS was active over the - :ref:`total CU cycles `. - - - Percent - - * - Access Rate - - - Indicates the percentage of SIMDs in the :ref:`VALU ` [#lds-workload]_ - actively issuing LDS instructions, averaged over the lifetime of the - kernel. 
Calculated as the ratio of the total number of cycles spent by - the :ref:`scheduler ` issuing :ref:`LDS ` - instructions over the - :ref:`total CU cycles `. - - - Percent - - * - Theoretical Bandwidth (% of Peak) - - - Indicates the maximum amount of bytes that *could* have been loaded from, - stored to, or atomically updated in the LDS in this kernel, as a percent - of the peak LDS bandwidth achievable. See the - :ref:`LDS bandwidth example ` for more detail. - - - Percent - - * - Bank Conflict Rate - - - Indicates the percentage of active LDS cycles that were spent servicing - bank conflicts. Calculated as the ratio of LDS cycles spent servicing - bank conflicts over the number of LDS cycles that would have been - required to move the same amount of data in an uncontended access. [#lds-bank-conflict]_ - - - Percent +.. jinja:: lds-sol + :file: _templates/metrics_table.j2 .. rubric:: Footnotes @@ -90,93 +45,5 @@ Statistics The LDS statistics panel gives a more detailed view of the hardware: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - LDS Instructions - - - The total number of LDS instructions (including, but not limited to, - read/write/atomics and HIP's ``__shfl`` instructions) executed per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Theoretical Bandwidth - - - Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS per - :ref:`normalization unit `. Does *not* take into - account the execution mask of the wavefront when the instruction was - executed. See the - :ref:`LDS bandwidth example ` for more detail. - - - Bytes per :ref:`normalization unit ` - - * - LDS Latency - - - The average number of round-trip cycles (i.e., from issue to data-return - / acknowledgment) required for an LDS instruction to complete. 
- - - Cycles - - * - Bank Conflicts/Access - - - The ratio of the number of cycles spent in the - :ref:`LDS scheduler ` due to bank conflicts (as determined by - the conflict resolution hardware) to the base number of cycles that would - be spent in the LDS scheduler in a completely uncontended case. This is - the unnormalized form of the Bank Conflict Rate. - - - Conflicts/Access - - * - Index Accesses - - - The total number of cycles spent in the :ref:`LDS scheduler ` - over all operations per :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` - - * - Atomic Return Cycles - - - The total number of cycles spent on LDS atomics with return per - :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` - - * - Bank Conflicts - - - The total number of cycles spent in the :ref:`LDS scheduler ` - due to bank conflicts (as determined by the conflict resolution hardware) - per :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` - - * - Address Conflicts - - - The total number of cycles spent in the :ref:`LDS scheduler ` - due to address conflicts (as determined by the conflict resolution - hardware) per :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` - - * - Unaligned Stall - - - The total number of cycles spent in the :ref:`LDS scheduler ` - due to stalls from non-dword aligned addresses per - :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` - - * - Memory Violations - - - The total number of out-of-bounds accesses made to the LDS, per - :ref:`normalization unit `. This is unused and - expected to be zero in most configurations for modern CDNA™ accelerators. - - - Accesses per :ref:`normalization unit ` +.. 
jinja:: lds-stats + :file: _templates/metrics_table.j2 diff --git a/projects/rocprofiler-compute/docs/conceptual/pipeline-metrics.rst b/projects/rocprofiler-compute/docs/conceptual/pipeline-metrics.rst index 7c37ee846a..499c8e2b6f 100644 --- a/projects/rocprofiler-compute/docs/conceptual/pipeline-metrics.rst +++ b/projects/rocprofiler-compute/docs/conceptual/pipeline-metrics.rst @@ -23,97 +23,8 @@ Wavefront launch stats The wavefront launch stats panel gives general information about the kernel launch: -.. list-table:: - :header-rows: 1 - :widths: 20 65 15 - - * - Metric - - - Description - - - Unit - - * - Grid Size - - - The total number of work-items (or, threads) launched as a part of - the kernel dispatch. In HIP, this is equivalent to the total grid size - multiplied by the total workgroup (or, block) size. - - - :ref:`Work-items ` - - * - Workgroup Size - - - The total number of work-items (or, threads) in each workgroup - (or, block) launched as part of the kernel dispatch. In HIP, this is - equivalent to the total block size. - - - :ref:`Work-items ` - - * - Total Wavefronts - - - The total number of wavefronts launched as part of the kernel dispatch. - On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront size is - always 64 work-items. Thus, the total number of wavefronts should be - equivalent to the ceiling of grid size divided by 64. - - - :ref:`Wavefronts ` - - * - Saved Wavefronts - - - The total number of wavefronts saved at a context-save. See - `cwsr_enable `_. - - - :ref:`Wavefronts ` - - * - Restored Wavefronts - - - The total number of wavefronts restored from a context-save. See - `cwsr_enable `_. - - - :ref:`Wavefronts ` - - * - VGPRs - - - The number of architected vector general-purpose registers allocated for - the kernel, see :ref:`VALU `. Note: this may not exactly - match the number of VGPRs requested by the compiler due to allocation - granularity. 
- - - :ref:`VGPRs ` - - * - AGPRs - - - The number of accumulation vector general-purpose registers allocated for - the kernel, see :ref:`AGPRs `. Note: this may not exactly - match the number of AGPRs requested by the compiler due to allocation - granularity. - - - :ref:`AGPRs ` - - * - SGPRs - - - The number of scalar general-purpose registers allocated for the kernel, - see :ref:`SALU `. Note: this may not exactly match the number - of SGPRs requested by the compiler due to allocation granularity. - - - :ref:`SGPRs ` - - * - LDS Allocation - - - The number of bytes of :doc:`LDS ` memory (or, shared - memory) allocated for this kernel. Note: This may also be larger than - what was requested at compile time due to both allocation granularity and - dynamic per-dispatch LDS allocations. - - - Bytes per :ref:`workgroup ` - - * - Scratch Allocation - - - The number of bytes of :ref:`scratch memory ` requested - per work-item for this kernel. Scratch memory is used for stack memory - on the accelerator, as well as for register spills and restores. - - - Bytes per :ref:`work-item ` +.. jinja:: wavefront-launch-stats + :file: _templates/metrics_table.j2 .. _wavefront-runtime-stats: @@ -123,96 +34,8 @@ Wavefront runtime stats The wavefront runtime statistics gives a high-level overview of the execution of wavefronts in a kernel: -.. list-table:: - :header-rows: 1 - :widths: 18 65 17 - - * - Metric - - - Description - - - Unit - - * - :ref:`Kernel time ` - - - The total duration of the executed kernel. Note: this should not be - directly compared to the wavefront cycles / timings below. - - - Nanoseconds - - * - :ref:`Kernel cycles ` - - - The total duration of the executed kernel in cycles. Note: this should - not be directly compared to the wavefront cycles / timings below. - - - Cycles - - * - Instructions per wavefront - - - The average number of instructions (of all types) executed per wavefront. - This is averaged over all wavefronts in a kernel dispatch. 
- - - Instructions / wavefront - - * - Wave cycles - - - The number of cycles a wavefront in the kernel dispatch spent resident on - a compute unit per :ref:`normalization unit `. This - is averaged over all wavefronts in a kernel dispatch. Note: this should - not be directly compared to the kernel cycles above. - - - Cycles per :ref:`normalization unit ` - - * - Dependency wait cycles - - - The number of cycles a wavefront in the kernel dispatch stalled waiting - on memory of any kind (e.g., instruction fetch, vector or scalar memory, - etc.) per :ref:`normalization unit `. This counter - is incremented at every cycle by *all* wavefronts on a CU stalled at a - memory operation. As such, it is most useful to get a sense of how waves - were spending their time, rather than identification of a precise limiter - because another wave could be actively executing while a wave is stalled. - The sum of this metric, Issue Wait Cycles and Active Cycles should be - equal to the total Wave Cycles metric. - - - Cycles per :ref:`normalization unit ` - - * - Issue Wait Cycles - - - The number of cycles a wavefront in the kernel dispatch was unable to - issue an instruction for any reason (e.g., execution pipe back-pressure, - arbitration loss, etc.) per - :ref:`normalization unit `. This counter is - incremented at every cycle by *all* wavefronts on a CU unable to issue an - instruction. As such, it is most useful to get a sense of how waves were - spending their time, rather than identification of a precise limiter - because another wave could be actively executing while a wave is issue - stalled. The sum of this metric, Dependency Wait Cycles and Active - Cycles should be equal to the total Wave Cycles metric. - - - Cycles per :ref:`normalization unit ` - - * - Active Cycles - - - The average number of cycles a wavefront in the kernel dispatch was - actively executing instructions per - :ref:`normalization unit `. 
This measurement is made - on a per-wavefront basis, and may include cycles that another wavefront - spent actively executing (on another execution unit, for example) or was - stalled. As such, it is most useful to get a sense of how waves were - spending their time, rather than identification of a precise limiter. The - sum of this metric, Issue Wait Cycles and Active Wait Cycles should be - equal to the total Wave Cycles metric. - - - Cycles per :ref:`normalization unit ` - - * - Wavefront Occupancy - - - The time-averaged number of wavefronts resident on the accelerator over - the lifetime of the kernel. Note: this metric may be inaccurate for - short-running kernels (less than 1ms). - - - :ref:`Wavefronts ` +.. jinja:: wavefront-runtime-stats + :file: _templates/metrics_table.j2 .. note:: @@ -256,71 +79,8 @@ This panel shows the total number of each type of instruction issued to the :doc:`various compute pipelines ` on the :doc:`CU `. These are: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - :ref:`VALU ` instructions - - - The total number of vector arithmetic logic unit (VALU) operations - issued. These are the workhorses of the - :doc:`compute unit `, and are used to execute a wide range of - instruction types including floating point operations, non-uniform - address calculations, transcendental operations, integer operations, - shifts, conditional evaluation, etc. - - - Instructions - - * - VMEM instructions - - - The total number of vector memory operations issued. These include most - loads, stores and atomic operations and all accesses to - :ref:`generic, global, private and texture ` memory. - - - Instructions - - * - :doc:`LDS ` instructions - - - The total number of LDS (also known as shared memory) operations issued. - These include loads, stores, atomics, and HIP's ``__shfl`` operations. 
- - - Instructions - - * - :ref:`MFMA ` instructions - - - The total number of matrix fused multiply-add instructions issued. - - - Instructions - - * - :ref:`SALU ` instructions - - - The total number of scalar arithmetic logic unit (SALU) operations - issued. Typically these are used for address calculations, literal - constants, and other operations that are *provably* uniform across a - wavefront. Although scalar memory (SMEM) operations are issued by the - SALU, they are counted separately in this section. - - - Instructions - - * - SMEM instructions - - - The total number of scalar memory (SMEM) operations issued. These are - typically used for loading kernel arguments, base-pointers and loads - from HIP's ``__constant__`` memory. - - - Instructions - - * - :ref:`Branch ` instructions - - - The total number of branch operations issued. These typically consist of - jump or branch operations and are used to implement control flow. - - - Instructions +.. jinja:: instruction-mix + :file: _templates/metrics_table.j2 .. note:: @@ -345,133 +105,8 @@ include :ref:`MFMA ` instructions using the same precision; for instance, the “F16-ADD” metric does not include any 16-bit floating point additions executed as part of an MFMA instruction using the same precision. -.. list-table:: - :header-rows: 1 - :widths: 15 65 20 - - * - Metric - - - Description - - - Unit - - * - INT32 - - - The total number of instructions operating on 32-bit integer operands - issued to the VALU per :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - INT64 - - - The total number of instructions operating on 64-bit integer operands - issued to the VALU per :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F16-ADD - - - The total number of addition instructions operating on 16-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. 
- - - Instructions per :ref:`normalization unit ` - - * - F16-MUL - - - The total number of multiplication instructions operating on 16-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F16-FMA - - - The total number of fused multiply-add instructions operating on 16-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F16-TRANS - - - The total number of transcendental instructions (e.g., `sqrt`) operating - on 16-bit floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F32-ADD - - - The total number of addition instructions operating on 32-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F32-MUL - - - The total number of multiplication instructions operating on 32-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F32-FMA - - - The total number of fused multiply-add instructions operating on 32-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F32-TRANS - - - The total number of transcendental instructions (such as ``sqrt``) - operating on 32-bit floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F64-ADD - - - The total number of addition instructions operating on 64-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. 
- - - Instructions per :ref:`normalization unit ` - - * - F64-MUL - - - The total number of multiplication instructions operating on 64-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F64-FMA - - - The total number of fused multiply-add instructions operating on 64-bit - floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - F64-TRANS - - - The total number of transcendental instructions (such as `sqrt`) - operating on 64-bit floating-point operands issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Conversion - - - The total number of type conversion instructions (such as converting data - to or from F32↔F64) issued to the VALU per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` +.. jinja:: valu-arith-instruction-mix + :file: _templates/metrics_table.j2 For an example of these counters in action, refer to :ref:`valu-arith-instruction-mix-ex`. @@ -502,57 +137,8 @@ This section details the types of Matrix Fused Multiply-Add MFMA instructions are classified by the type of input data they operate on, and *not* the data type the result is accumulated to. -.. list-table:: - :header-rows: 1 - :widths: 25 60 17 - - * - Metric - - - Description - - - Unit - - * - MFMA-I8 Instructions - - - The total number of 8-bit integer :ref:`MFMA ` instructions - issued per :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - MFMA-F8 Instructions - - - The total number of 8-bit floating point :ref:`MFMA ` - instructions issued per :ref:`normalization unit `. This is supported in AMD Instinct MI300 series and later only. 
- - - Instructions per :ref:`normalization unit ` - - * - MFMA-F16 Instructions - - - The total number of 16-bit floating point :ref:`MFMA ` - instructions issued per :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - MFMA-BF16 Instructions - - - The total number of 16-bit brain floating point :ref:`MFMA ` - instructions issued per :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - MFMA-F32 Instructions - - - The total number of 32-bit floating-point :ref:`MFMA ` - instructions issued per :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - MFMA-F64 Instructions - - - The total number of 64-bit floating-point :ref:`MFMA ` - instructions issued per :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` +.. jinja:: mfma-instruction-mix + :file: _templates/metrics_table.j2 Compute pipeline ================ @@ -612,84 +198,8 @@ various precisions. We note that unlike the are reported as FLOPs and IOPs, that is, the total number of operations executed. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - VALU FLOPs - - - The total floating-point operations executed per second on the - :ref:`VALU `. This is also presented as a percent of the peak - theoretical FLOPs achievable on the specific accelerator. Note: this does - not include any floating-point operations from :ref:`MFMA ` - instructions. - - - GFLOPs - - * - VALU IOPs - - - The total integer operations executed per second on the - :ref:`VALU `. This is also presented as a percent of the peak - theoretical IOPs achievable on the specific accelerator. Note: this does - not include any integer operations from :ref:`MFMA ` - instructions. - - - GIOPs - - * - MFMA FLOPs (BF16) - - - The total number of 16-bit brain floating point :ref:`MFMA ` - operations executed per second. 
Note: this does not include any 16-bit - brain floating point operations from :ref:`VALU ` - instructions. This is also presented as a percent of the peak theoretical - BF16 MFMA operations achievable on the specific accelerator. - - - GFLOPs - - * - MFMA FLOPs (F16) - - - The total number of 16-bit floating point :ref:`MFMA ` - operations executed per second. Note: this does not include any 16-bit - floating point operations from :ref:`VALU ` instructions. This - is also presented as a percent of the peak theoretical F16 MFMA - operations achievable on the specific accelerator. - - - GFLOPs - - * - MFMA FLOPs (F32) - - - The total number of 32-bit floating point :ref:`MFMA ` - operations executed per second. Note: this does not include any 32-bit - floating point operations from :ref:`VALU ` instructions. This - is also presented as a percent of the peak theoretical F32 MFMA - operations achievable on the specific accelerator. - - - GFLOPs - - * - MFMA FLOPs (F64) - - - The total number of 64-bit floating point :ref:`MFMA ` - operations executed per second. Note: this does not include any 64-bit - floating point operations from :ref:`VALU ` instructions. This - is also presented as a percent of the peak theoretical F64 MFMA - operations achievable on the specific accelerator. - - - GFLOPs - - * - MFMA IOPs (INT8) - - - The total number of 8-bit integer :ref:`MFMA ` operations - executed per second. Note: this does not include any 8-bit integer - operations from :ref:`VALU ` instructions. This is also - presented as a percent of the peak theoretical INT8 MFMA operations - achievable on the specific accelerator. - - - GIOPs +.. jinja:: compute-speed-of-light + :file: _templates/metrics_table.j2 .. _pipeline-stats: @@ -702,120 +212,8 @@ various execution units on the :doc:`CU `. Refer to :ref:`scheduler ` the for a high-level overview of execution units and instruction issue. -.. 
list-table:: - :header-rows: 1 - :widths: 20 65 15 - - * - Metric - - - Description - - - Unit - - * - IPC - - - The ratio of the total number of instructions executed on the - :doc:`CU ` over the - :ref:`total active CU cycles `. - - - Instructions per-cycle - - * - IPC (Issued) - - - The ratio of the total number of - (non-:ref:`internal `) instructions issued over - the number of cycles where the :ref:`scheduler ` was - actively working on issuing instructions. Refer to the - :ref:`Issued IPC ` example for further detail. - - - Instructions per-cycle - - * - SALU utilization - - - Indicates what percent of the kernel's duration the - :ref:`SALU ` was busy executing instructions. Computed as the - ratio of the total number of cycles spent by the - :ref:`scheduler ` issuing SALU / :ref:`SMEM ` - instructions over the :ref:`total CU cycles `. - - - Percent - - * - VALU utilization - - - Indicates what percent of the kernel's duration the - :ref:`VALU ` was busy executing instructions. Does not include - :ref:`VMEM ` operations. Computed as the ratio of the total - number of cycles spent by the :ref:`scheduler ` issuing - VALU instructions over the :ref:`total CU cycles `. - - - Percent - - * - VMEM utilization - - - Indicates what percent of the kernel's duration the - :ref:`VMEM ` unit was busy executing instructions, including - both global/generic and spill/scratch operations (see the - :ref:`VMEM instruction count metrics ` for more - detail). Does not include :ref:`VALU ` operations. Computed - as the ratio of the total number of cycles spent by the - :ref:`scheduler ` issuing VMEM instructions over the - :ref:`total CU cycles `. - - - Percent - - * - Branch utilization - - - Indicates what percent of the kernel's duration the - :ref:`branch ` unit was busy executing instructions. - Computed as the ratio of the total number of cycles spent by the - :ref:`scheduler ` issuing branch instructions over the - :ref:`total CU cycles `. 
- - - Percent - - * - VALU active threads - - - Indicates the average level of :ref:`divergence ` within - a wavefront over the lifetime of the kernel. The number of work-items - that were active in a wavefront during execution of each - :ref:`VALU ` instruction, time-averaged over all VALU - instructions run on all wavefronts in the kernel. - - - Work-items - - * - MFMA utilization - - - Indicates what percent of the kernel's duration the - :ref:`MFMA ` unit was busy executing instructions. Computed as - the ratio of the total number of cycles spent by the - :ref:`MFMA ` was busy over the - :ref:`total CU cycles `. - - - Percent - - * - MFMA instruction cycles - - - The average duration of :ref:`MFMA ` instructions in this - kernel in cycles. Computed as the ratio of the total number of cycles the - MFMA unit was busy over the total number of MFMA instructions. Compare - to, for example, the - `AMD Matrix Instruction Calculator `_. - - - Cycles per instruction - - * - VMEM latency - - - The average number of round-trip cycles (that is, from issue to data - return / acknowledgment) required for a VMEM instruction to complete. - - - Cycles - - * - SMEM latency - - - The average number of round-trip cycles (that is, from issue to data - return / acknowledgment) required for a SMEM instruction to complete. - - - Cycles +.. jinja:: pipeline-stats + :file: _templates/metrics_table.j2 .. note:: @@ -846,70 +244,5 @@ not. For more detail on how operations are counted see the take into account the execution mask of the operation, and will report the same value even if EXEC is identically zero. -.. list-table:: - :header-rows: 1 - :widths: 18 65 17 - - * - Metric - - - Description - - - Unit - - * - FLOPs (Total) - - - The total number of floating-point operations executed on either the - :ref:`VALU ` or :ref:`MFMA ` units, per - :ref:`normalization unit `. 
- - - FLOP per :ref:`normalization unit ` - - * - IOPs (Total) - - - The total number of integer operations executed on either the - :ref:`VALU ` or :ref:`MFMA ` units, per - :ref:`normalization unit `. - - - IOP per :ref:`normalization unit ` - - * - F16 OPs - - - The total number of 16-bit floating-point operations executed on either the - :ref:`VALU ` or :ref:`MFMA ` units, per - :ref:`normalization unit `. - - - FLOP per :ref:`normalization unit ` - - * - BF16 OPs - - - The total number of 16-bit brain floating-point operations executed on either the - :ref:`VALU ` or :ref:`MFMA ` units, per - :ref:`normalization unit `. Note: on current CDNA - accelerators, the VALU has no native BF16 instructions. - - - FLOP per :ref:`normalization unit ` - - * - F32 OPs - - - The total number of 32-bit floating-point operations executed on either - the :ref:`VALU ` or :ref:`MFMA ` units, per - :ref:`normalization unit `. - - - FLOP per :ref:`normalization unit ` - - * - F64 OPs - - - The total number of 64-bit floating-point operations executed on either - the :ref:`VALU ` or :ref:`MFMA ` units, per - :ref:`normalization unit `. - - - FLOP per :ref:`normalization unit ` - - * - INT8 OPs - - - The total number of 8-bit integer operations executed on either the - :ref:`VALU ` or :ref:`MFMA ` units, per - :ref:`normalization unit `. Note: on current CDNA - accelerators, the VALU has no native INT8 instructions. - - - IOPs per :ref:`normalization unit ` +.. 
jinja:: arithmetic-operations + :file: _templates/metrics_table.j2 \ No newline at end of file diff --git a/projects/rocprofiler-compute/docs/conceptual/shader-engine.rst b/projects/rocprofiler-compute/docs/conceptual/shader-engine.rst index 350ea4624f..83995d66fb 100644 --- a/projects/rocprofiler-compute/docs/conceptual/shader-engine.rst +++ b/projects/rocprofiler-compute/docs/conceptual/shader-engine.rst @@ -71,40 +71,8 @@ Scalar L1D Speed-of-Light The Scalar L1D speed-of-light chart shows some key metrics of the sL1D cache as a comparison with the peak achievable values of those metrics: -.. list-table:: - :header-rows: 1 - :widths: 20 65 15 - - * - Metric - - - Description - - - Unit - - * - Bandwidth - - - The number of bytes looked up in the sL1D cache, as a percent of the peak - theoretical bandwidth. Calculated as the ratio of sL1D requests over the - :ref:`total sL1D cycles `. - - - Percent - - * - Cache Hit Rate - - - The percent of sL1D requests that hit [#sl1d-cache]_ on a previously - loaded line in the cache. Calculated as the ratio of the number of sL1D - requests that hit over the number of all sL1D requests. - - - Percent - - * - sL1D-L2 BW - - - The number of bytes requested by the sL1D from the L2 cache, as a percent - of the peak theoretical sL1D → L2 cache bandwidth. Calculated as the - ratio of the total number of requests from the sL1D to the L2 cache over - the :ref:`total sL1D-L2 interface cycles `. - - - Percent +.. jinja:: desc-sl1d-sol + :file: _templates/metrics_table.j2 .. _desc-sl1d-stats: @@ -114,104 +82,8 @@ Scalar L1D cache accesses This panel gives more detail on the types of accesses made to the sL1D, and the hit/miss statistics. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Requests - - - The total number of requests, of any size or type, made to the sL1D per - :ref:`normalization unit `. 
- - - Requests per :ref:`normalization unit ` - - * - Hits - - - The total number of sL1D requests that hit on a previously loaded cache - line, per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Misses - Non Duplicated - - - The total number of sL1D requests that missed on a cache line that *was - not* already pending due to another request, per - :ref:`normalization unit `. See :ref:`desc-sl1d-sol` - for more detail. - - - Requests per :ref:`normalization unit ` - - * - Misses - Duplicated - - - The total number of sL1D requests that missed on a cache line that *was* - already pending due to another request, per - :ref:`normalization unit `. See - :ref:`desc-sl1d-sol` for more detail. - - - Requests per :ref:`normalization unit ` - - * - Cache Hit Rate - - - Indicates the percent of sL1D requests that hit on a previously loaded - line the cache. The ratio of the number of sL1D requests that hit - [#sl1d-cache]_ over the number of all sL1D requests. - - - Percent - - * - Read Requests (Total) - - - The total number of sL1D read requests of any size, per - :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Atomic Requests - - - The total number of sL1D atomic requests of any size, per - :ref:`normalization unit `. Typically unused on CDNA - accelerators. - - - Requests per :ref:`normalization unit ` - - * - Read Requests (1 DWord) - - - The total number of sL1D read requests made for a single dword of data - (4B), per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Read Requests (2 DWord) - - - The total number of sL1D read requests made for a two dwords of data - (8B), per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Read Requests (4 DWord) - - - The total number of sL1D read requests made for a four dwords of data - (16B), per :ref:`normalization unit `. 
- - - Requests per :ref:`normalization unit ` - - * - Read Requests (8 DWord) - - - The total number of sL1D read requests made for a eight dwords of data - (32B), per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Read Requests (16 DWord) - - - The total number of sL1D read requests made for a sixteen dwords of data - (64B), per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` +.. jinja:: desc-sl1d-stats + :file: _templates/metrics_table.j2 .. _desc-sl1d-l2-interface: @@ -222,56 +94,8 @@ This panel gives more detail on the data requested across the sL1D↔ :doc:`L2 ` interface. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - sL1D-L2 BW - - - The total number of bytes read from, written to, or atomically updated - across the sL1D↔:doc:`L2 ` interface, per - :ref:`normalization unit `. Note that sL1D writes - and atomics are typically unused on current CDNA accelerators, so in the - majority of cases this can be interpreted as an sL1D→L2 read bandwidth. - - - Bytes per :ref:`normalization unit ` - - * - Read Requests - - - The total number of read requests from sL1D to the :doc:`L2 `, - per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Write Requests - - - The total number of write requests from sL1D to the :doc:`L2 `, - per :ref:`normalization unit `. Typically unused on - current CDNA accelerators. - - - Requests per :ref:`normalization unit ` - - * - Atomic Requests - - - The total number of atomic requests from sL1D to the - :doc:`L2 `, per - :ref:`normalization unit `. Typically unused on - current CDNA accelerators. - - - Requests per :ref:`normalization unit ` - - * - Stall Cycles - - - The total number of cycles the sL1D↔ - :doc:`L2 ` interface was stalled, per - :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` +.. jinja:: desc-sl1d-l2-interface + :file: _templates/metrics_table.j2 .. 
rubric:: Footnotes @@ -318,46 +142,8 @@ The L1 Instruction Cache speed-of-light chart shows some key metrics of the L1I cache as a comparison with the peak achievable values of those metrics: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Bandwidth - - - The number of bytes looked up in the L1I cache, as a percent of the peak - theoretical bandwidth. Calculated as the ratio of L1I requests over the - :ref:`total L1I cycles `. - - - Percent - - * - Cache Hit Rate - - - The percent of L1I requests that hit on a previously loaded line the - cache. Calculated as the ratio of the number of L1I requests that hit - [#l1i-cache]_ over the number of all L1I requests. - - - Percent - - * - L1I-L2 BW - - - The percent of the peak theoretical L1I → L2 cache request bandwidth - achieved. Calculated as the ratio of the total number of requests from - the L1I to the L2 cache over the - :ref:`total L1I-L2 interface cycles `. - - - Percent - - * - Instruction Fetch Latency - - - The average number of cycles spent to fetch instructions to a - :doc:`CU `. - - - Cycles +.. jinja:: desc-l1i-sol + :file: _templates/metrics_table.j2 .. _desc-l1i-stats: @@ -366,54 +152,10 @@ L1I cache accesses This panel gives more detail on the hit/miss statistics of the L1I: -.. list-table:: - :header-rows: 1 +.. jinja:: desc-l1i-stats + :file: _templates/metrics_table.j2 - * - Metric - - - Description - - - Unit - - * - Requests - - - The total number of requests made to the L1I per - :ref:`normalization-unit `. - - - Requests per :ref:`normalization unit `. - - * - Hits - - - The total number of L1I requests that hit on a previously loaded cache - line, per :ref:`normalization-unit `. - - - Requests per :ref:`normalization unit ` - - * - Misses - Non Duplicated - - - The total number of L1I requests that missed on a cache line that - *were not* already pending due to another request, per - :ref:`normalization-unit `. 
See note in - :ref:`desc-l1i-sol` for more detail. - - - Requests per :ref:`normalization unit `. - - * - Misses - Duplicated - - - The total number of L1I requests that missed on a cache line that *were* - already pending due to another request, per - :ref:`normalization-unit `. See note in - :ref:`desc-l1i-sol` for more detail. - - - Requests per :ref:`normalization unit ` - - * - Cache Hit Rate - - - The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded - line the cache. Calculated as the ratio of the number of L1I requests - that hit over the number of all L1I requests. - - - Percent +.. _desc-l1i-l2-interface: L1I - L2 interface ------------------ @@ -421,21 +163,8 @@ L1I - L2 interface This panel gives more detail on the data requested across the L1I-:doc:`L2 ` interface. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - L1I-L2 BW - - - The total number of bytes read across the L1I-:doc:`L2 ` - interface, per :ref:`normalization unit `. - - - Bytes per :ref:`normalization unit ` +.. jinja:: desc-l1i-l2-interface + :file: _templates/metrics_table.j2 .. rubric:: Footnotes @@ -493,90 +222,18 @@ issuing concurrently). kernels). This means that these scheduler-pipe utilization metrics are expected to reach (for example) a maximum of one pipe active -- only 25%. +.. _spi-util: + Workgroup manager utilizations ------------------------------ This section describes the utilization of the workgroup manager, and the hardware components it interacts with. -.. list-table:: - :header-rows: 1 - :widths: 20 65 15 +.. jinja:: spi-util + :file: _templates/metrics_table.j2 - * - Metric - - - Description - - - Unit - - * - Accelerator utilization - - - The percent of cycles in the kernel where the accelerator was actively - doing any work. - - - Percent - - * - Scheduler-pipe utilization - - - The percent of :ref:`total scheduler-pipe cycles ` in - the kernel where the scheduler-pipes were actively doing any work. 
Note: - this value is expected to range between 0% and 25%. See :ref:`desc-spi`. - - - Percent - - * - Workgroup manager utilization - - - The percent of cycles in the kernel where the workgroup manager was - actively doing any work. - - - Percent - - * - Shader engine utilization - - - The percent of :ref:`total shader engine cycles ` in the - kernel where any CU in a shader-engine was actively doing any work, - normalized over all shader-engines. Low values (e.g., << 100%) indicate - that the accelerator was not fully saturated by the kernel, or a - potential load-imbalance issue. - - - Percent - - * - SIMD utilization - - - The percent of :ref:`total SIMD cycles ` in the kernel - where any :ref:`SIMD ` on a CU was actively doing any work, - summed over all CUs. Low values (less than 100%) indicate that the - accelerator was not fully saturated by the kernel, or a potential - load-imbalance issue. - - - Percent - - * - Dispatched workgroups - - - The total number of workgroups forming this kernel launch. - - - Workgroups - - * - Dispatched wavefronts - - - The total number of wavefronts, summed over all workgroups, forming this - kernel launch. - - - Wavefronts - - * - VGPR writes - - - The average number of cycles spent initializing :ref:`VGPRs ` - at wave creation. - - - Cycles/wave - - * - SGPR Writes - - - The average number of cycles spent initializing :ref:`SGPRs ` - at wave creation. - - - Cycles/wave +.. _spi-resc-util: Resource allocation ------------------- @@ -590,117 +247,5 @@ limited by LDS usage, for example, but may still achieve high occupancy levels such that improving occupancy further may not improve performance. See :ref:`occupancy-example` for details. -.. 
list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Not-scheduled rate (Workgroup Manager) - - - The percent of :ref:`total scheduler-pipe cycles ` in - the kernel where a workgroup could not be scheduled to a - :doc:`CU ` due to a bottleneck within the workgroup manager - rather than a lack of a CU or :ref:`SIMD ` with sufficient - resources. Note: this value is expected to range between 0-25%. See note - in :ref:`workgroup manager ` description. - - - Percent - - * - Not-scheduled rate (Scheduler-Pipe) - - - The percent of :ref:`total scheduler-pipe cycles ` in - the kernel where a workgroup could not be scheduled to a - :doc:`CU ` due to a bottleneck within the scheduler-pipes - rather than a lack of a CU or :ref:`SIMD ` with sufficient - resources. Note: this value is expected to range between 0-25%, see note - in :ref:`workgroup manager ` description. - - - Percent - - * - Scheduler-Pipe Stall Rate - - - The percent of :ref:`total scheduler-pipe cycles ` in - the kernel where a workgroup could not be scheduled to a - :doc:`CU ` due to occupancy limitations (like a lack of a - CU or :ref:`SIMD ` with sufficient resources). Note: this - value is expected to range between 0-25%, see note in - :ref:`workgroup manager ` description. - - - Percent - - * - Scratch Stall Rate - - - The percent of :ref:`total shader-engine cycles ` in the - kernel where a workgroup could not be scheduled to a - :doc:`CU ` due to lack of - :ref:`private (a.k.a., scratch) memory ` slots. While this - can reach up to 100%, note that the actual occupancy limitations on a - kernel using private memory are typically quite small (for example, less - than 1% of the total number of waves that can be scheduled to an - accelerator). - - - Percent - - * - Insufficient SIMD Waveslots - - - The percent of :ref:`total SIMD cycles ` in the kernel - where a workgroup could not be scheduled to a :ref:`SIMD ` - due to lack of available :ref:`waveslots `. 
- - - Percent - - * - Insufficient SIMD VGPRs - - - The percent of :ref:`total SIMD cycles ` in the kernel - where a workgroup could not be scheduled to a :ref:`SIMD ` - due to lack of available :ref:`VGPRs `. - - - Percent - - * - Insufficient SIMD SGPRs - - - The percent of :ref:`total SIMD cycles ` in the kernel - where a workgroup could not be scheduled to a :ref:`SIMD ` - due to lack of available :ref:`SGPRs `. - - - Percent - - * - Insufficient CU LDS - - - The percent of :ref:`total CU cycles ` in the kernel - where a workgroup could not be scheduled to a :doc:`CU ` - due to lack of available :doc:`LDS `. - - - Percent - - * - Insufficient CU Barriers - - - The percent of :ref:`total CU cycles ` in the kernel - where a workgroup could not be scheduled to a :doc:`CU ` - due to lack of available :ref:`barriers `. - - - Percent - - * - Reached CU Workgroup Limit - - - The percent of :ref:`total CU cycles ` in the kernel - where a workgroup could not be scheduled to a :doc:`CU ` - due to limits within the workgroup manager. This is expected to be - always be zero on CDNA2 or newer accelerators (and small for previous - accelerators). - - - Percent - - * - Reached CU Wavefront Limit - - - The percent of :ref:`total CU cycles ` in the kernel - where a wavefront could not be scheduled to a :doc:`CU ` - due to limits within the workgroup manager. This is expected to be - always be zero on CDNA2 or newer accelerators (and small for previous - accelerators). - - - Percent +.. 
jinja:: spi-resc-util + :file: _templates/metrics_table.j2 diff --git a/projects/rocprofiler-compute/docs/conceptual/system-speed-of-light.rst b/projects/rocprofiler-compute/docs/conceptual/system-speed-of-light.rst index 5652a5e3f9..324a0f8ef5 100644 --- a/projects/rocprofiler-compute/docs/conceptual/system-speed-of-light.rst +++ b/projects/rocprofiler-compute/docs/conceptual/system-speed-of-light.rst @@ -2,6 +2,8 @@ :description: ROCm Compute Profiler performance model: System Speed-of-Light :keywords: Omniperf, ROCm Compute Profiler, ROCm, profiler, tool, Instinct, accelerator, AMD, system, speed of light +.. _sys-sol: + ********************* System Speed-of-Light ********************* @@ -20,308 +22,5 @@ of ROCm Compute Profiler’s profiling report. Instinct™ MI-series accelerators. For more detail on how operations are counted, see the :ref:`metrics-flop-count` section. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - :ref:`VALU ` FLOPs - - - The total floating-point operations executed per second on the - :ref:`VALU `. This is also presented as a percent of the peak - theoretical FLOPs achievable on the specific accelerator. Note: this does - not include any floating-point operations from :ref:`MFMA ` - instructions. - - - GFLOPs - - * - :ref:`VALU ` IOPs - - - The total integer operations executed per second on the - :ref:`VALU `. This is also presented as a percent of the peak - theoretical IOPs achievable on the specific accelerator. Note: this does - not include any integer operations from :ref:`MFMA ` - instructions. - - - GIOPs - - * - :ref:`MFMA ` FLOPs (F8) - - - The total number of 8-bit floating point :ref:`MFMA ` - operations executed per second. This does not include any 16-bit - brain floating point operations from :ref:`VALU ` - instructions. This is also presented as a percent of the peak theoretical - F8 MFMA operations achievable on the specific accelerator. 
It is supported on AMD Instinct MI300 series and later only. - - - GFLOPs - - * - :ref:`MFMA ` FLOPs (BF16) - - - The total number of 16-bit brain floating point :ref:`MFMA ` - operations executed per second. Note: this does not include any 16-bit - brain floating point operations from :ref:`VALU ` - instructions. This is also presented as a percent of the peak theoretical - BF16 MFMA operations achievable on the specific accelerator. - - - GFLOPs - - * - :ref:`MFMA ` FLOPs (F16) - - - The total number of 16-bit floating point :ref:`MFMA ` - operations executed per second. Note: this does not include any 16-bit - floating point operations from :ref:`VALU ` instructions. This - is also presented as a percent of the peak theoretical F16 MFMA - operations achievable on the specific accelerator. - - - GFLOPs - - * - :ref:`MFMA ` FLOPs (F32) - - - The total number of 32-bit floating point :ref:`MFMA ` - operations executed per second. Note: this does not include any 32-bit - floating point operations from :ref:`VALU ` instructions. This - is also presented as a percent of the peak theoretical F32 MFMA - operations achievable on the specific accelerator. - - - GFLOPs - - * - :ref:`MFMA ` FLOPs (F64) - - - The total number of 64-bit floating point :ref:`MFMA ` - operations executed per second. Note: this does not include any 64-bit - floating point operations from :ref:`VALU ` instructions. This - is also presented as a percent of the peak theoretical F64 MFMA - operations achievable on the specific accelerator. - - - GFLOPs - - * - :ref:`MFMA ` IOPs (INT8) - - - The total number of 8-bit integer :ref:`MFMA ` operations - executed per second. Note: this does not include any 8-bit integer - operations from :ref:`VALU ` instructions. This is also - presented as a percent of the peak theoretical INT8 MFMA operations - achievable on the specific accelerator. 
- - - GIOPs - - * - :ref:`SALU ` utilization - - - Indicates what percent of the kernel's duration the - :ref:`SALU ` was busy executing instructions. Computed as the - ratio of the total number of cycles spent by the - :ref:`scheduler ` issuing :ref:`SALU ` or - :ref:`SMEM ` instructions over the - :ref:`total CU cycles `. - - - Percent - - * - :ref:`VALU ` utilization - - - Indicates what percent of the kernel's duration the - :ref:`VALU ` was busy executing instructions. Does not include - :ref:`VMEM ` operations. Computed as the ratio of the total - number of cycles spent by the :ref:`scheduler ` issuing - :ref:`VALU ` instructions over the - :ref:`total CU cycles `. - - - Percent - - * - :ref:`MFMA ` utilization - - - Indicates what percent of the kernel's duration the - :ref:`MFMA ` unit was busy executing instructions. Computed as - the ratio of the total number of cycles the MFMA was busy over the - :ref:`total CU cycles `. - - - Percent - - * - :ref:`VMEM ` utilization - - - Indicates what percent of the kernel's duration the - :ref:`VMEM ` unit was busy executing instructions, including - both global/generic and spill/scratch operations (see the - :ref:`VMEM instruction count metrics `) for more - detail). Does not include :ref:`VALU ` operations. Computed as - the ratio of the total number of cycles spent by the - :ref:`scheduler ` issuing VMEM instructions over the - :ref:`total CU cycles `. - - - Percent - - * - :ref:`Branch ` utilization - - - Indicates what percent of the kernel's duration the - :ref:`branch ` unit was busy executing instructions. - Computed as the ratio of the total number of cycles spent by the - :ref:`scheduler ` issuing :ref:`branch ` - instructions over the :ref:`total CU cycles ` - - - Percent - - * - :ref:`VALU ` active threads - - - Indicates the average level of :ref:`divergence ` within - a wavefront over the lifetime of the kernel. 
The number of work-items - that were active in a wavefront during execution of each - :ref:`VALU ` instruction, time-averaged over all VALU - instructions run on all wavefronts in the kernel. - - - Work-items - - * - IPC - - - The ratio of the total number of instructions executed on the - :doc:`CU ` over the - :ref:`total active CU cycles `. This is also - presented as a percent of the peak theoretical bandwidth achievable on - the specific accelerator. - - - Instructions per-cycle - - * - Wavefront occupancy - - - The time-averaged number of wavefronts resident on the accelerator over - the lifetime of the kernel. Note: this metric may be inaccurate for - short-running kernels (less than 1ms). This is also presented as a - percent of the peak theoretical occupancy achievable on the specific - accelerator. - - - Wavefronts - - * - :doc:`LDS ` theoretical bandwidth - - - Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS per unit time (see - :ref:`LDS Bandwidth ` example for more detail). This is - also presented as a percent of the peak theoretical F64 MFMA operations - achievable on the specific accelerator. - - - GB/s - - * - :doc:`LDS ` bank conflicts/access - - - The ratio of the number of cycles spent in the - :doc:`LDS scheduler ` due to bank conflicts (as - determined by the conflict resolution hardware) to the base number of - cycles that would be spent in the LDS scheduler in a completely - uncontended case. This is also presented in normalized form (i.e., the - Bank Conflict Rate). - - - Conflicts/Access - - * - :doc:`vL1D ` cache hit rate - - - The ratio of the number of vL1D cache line requests that hit in vL1D - cache over the total number of cache line requests to the - :ref:`vL1D cache RAM `. - - - Percent - - * - :doc:`vL1D ` cache bandwidth - - - The number of bytes looked up in the vL1D cache as a result of - :ref:`VMEM ` instructions per unit time. 
The number of bytes - is calculated as the number of cache lines requested multiplied by the - cache line size. This value does not consider partial requests, so e.g., - if only a single value is requested in a cache line, the data movement - will still be counted as a full cache line. This is also presented as a - percent of the peak theoretical bandwidth achievable on the specific - accelerator. - - - GB/s - - * - :doc:`L2 ` cache hit rate - - - The ratio of the number of L2 cache line requests that hit in the L2 - cache over the total number of incoming cache line requests to the L2 - cache. - - - Percent - - * - :doc:`L2 ` cache bandwidth - - - The number of bytes looked up in the L2 cache per unit time. The number - of bytes is calculated as the number of cache lines requested multiplied - by the cache line size. This value does not consider partial requests, so - e.g., if only a single value is requested in a cache line, the data - movement will still be counted as a full cache line. This is also - presented as a percent of the peak theoretical bandwidth achievable on - the specific accelerator. - - - GB/s - - * - :doc:`L2 `-fabric read BW - - - The number of bytes read by the L2 over the - :ref:`Infinity Fabric™ interface ` per unit time. This is also - presented as a percent of the peak theoretical bandwidth achievable on - the specific accelerator. - - - GB/s - - * - :doc:`L2 `-fabric write and atomic BW - - - The number of bytes sent by the L2 over the - :ref:`Infinity Fabric interface ` by write and atomic - operations per unit time. This is also presented as a percent of the peak - theoretical bandwidth achievable on the specific accelerator. - - - GB/s - - * - :doc:`L2 `-fabric read latency - - - The time-averaged number of cycles read requests spent in Infinity Fabric - before data was returned to the L2. 
- - - Cycles - - * - :doc:`L2 `-fabric write latency - - - The time-averaged number of cycles write requests spent in Infinity - Fabric before a completion acknowledgement was returned to the L2. - - - Cycles - - * - :ref:`sL1D ` cache hit rate - - - The percent of sL1D requests that hit on a previously loaded line the - cache. Calculated as the ratio of the number of sL1D requests that hit - over the number of all sL1D requests. - - - Percent - - * - :ref:`sL1D ` bandwidth - - - The number of bytes looked up in the sL1D cache per unit time. This is - also presented as a percent of the peak theoretical bandwidth achievable - on the specific accelerator. - - - GB/s - - * - :ref:`L1I ` bandwidth - - - The number of bytes looked up in the L1I cache per unit time. This is - also presented as a percent of the peak theoretical bandwidth achievable - on the specific accelerator. - - - GB/s - - * - :ref:`L1I ` cache hit rate - - - The percent of L1I requests that hit on a previously loaded line the - cache. Calculated as the ratio of the number of L1I requests that hit - over the number of all L1I requests. - - - Percent - - * - :ref:`L1I ` fetch latency - - - The average number of cycles spent to fetch instructions to a - :doc:`CU `. - - - Cycles +.. jinja:: sys-sol + :file: _templates/metrics_table.j2 diff --git a/projects/rocprofiler-compute/docs/conceptual/vector-l1-cache.rst b/projects/rocprofiler-compute/docs/conceptual/vector-l1-cache.rst index f845b791c8..c78acb4f3c 100644 --- a/projects/rocprofiler-compute/docs/conceptual/vector-l1-cache.rst +++ b/projects/rocprofiler-compute/docs/conceptual/vector-l1-cache.rst @@ -63,53 +63,8 @@ vL1D Speed-of-Light The vL1D’s speed-of-light chart shows several key metrics for the vL1D as a comparison with the peak achievable values of those metrics. -.. 
list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Hit Rate - - - The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ - in vL1D cache over the total number of cache line requests to the - :ref:`vL1D Cache RAM `. - - - Percent - - * - Bandwidth - - - The number of bytes looked up in the vL1D cache as a result of - :ref:`VMEM ` instructions, as a percent of the peak - theoretical bandwidth achievable on the specific accelerator. The number - of bytes is calculated as the number of cache lines requested multiplied - by the cache line size. This value does not consider partial requests, so - for instance, if only a single value is requested in a cache line, the - data movement will still be counted as a full cache line. - - - Percent - - * - Utilization - - - Indicates how busy the :ref:`vL1D Cache RAM ` was during the - kernel execution. The number of cycles where the vL1D Cache RAM is - actively processing any request divided by the number of cycles where the - vL1D is active [#vl1d-activity]_. - - - Percent - - * - Coalescing - - - Indicates how well memory instructions were coalesced by the - :ref:`address processing unit `, ranging from uncoalesced (25%) - to fully coalesced (100%). Calculated as the average number of - :ref:`thread-requests ` generated per instruction - divided by the ideal number of thread-requests per instruction. - - - Percent +.. jinja:: vl1d-sol + :file: _templates/metrics_table.j2 .. _desc-ta: @@ -145,45 +100,8 @@ processing unit. When the front-end cannot accept any more addresses, it must backpressure the wave-issue logic for the VMEM pipe and prevent the issue of further vector memory instructions. -.. 
list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Busy - - - Percent of the :ref:`total CU cycles ` the address - processor was busy - - - Percent - - * - Address Stall - - - Percent of the :ref:`total CU cycles ` the address - processor was stalled from sending address requests further into the vL1D - pipeline - - - Percent - - * - Data Stall - - - Percent of the :ref:`total CU cycles ` the address - processor was stalled from sending write/atomic data further into the - vL1D pipeline - - - Percent - - * - Data-Processor → Address Stall - - - Percent of :ref:`total CU cycles ` the address processor - was stalled waiting to send command data to the - :ref:`data processor ` - - - Percent +.. jinja:: ta-busy-stall + :file: _templates/metrics_table.j2 .. _ta-instruction-counts: @@ -232,80 +150,8 @@ kernel. These are broken down into a few major categories: The address processor counts these instruction types as follows: -.. list-table:: - :header-rows: 1 - - * - Type - - - Description - - - Unit - - * - Global/Generic - - - The total number of global & generic memory instructions executed on all - :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Global/Generic Read - - - The total number of global & generic memory read instructions executed on - all :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Global/Generic Write - - - The total number of global & generic memory write instructions executed - on all :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Global/Generic Atomic - - - The total number of global & generic memory atomic (with and without - return) instructions executed on all :doc:`compute units ` - on the accelerator, per :ref:`normalization unit `. 
- - - Instructions per :ref:`normalization unit ` - - * - Spill/Stack - - - The total number of spill/stack memory instructions executed on all - :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Spill/Stack Read - - - The total number of spill/stack memory read instructions executed on all - :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Spill/Stack Write - - - The total number of spill/stack memory write instructions executed on all - :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. - - - Instruction per :ref:`normalization unit ` - - * - Spill/Stack Atomic - - - The total number of spill/stack memory atomic (with and without return) - instructions executed on all :doc:`compute units ` on the - accelerator, per :ref:`normalization unit `. - Typically unused as these memory operations are typically used to - implement thread-local storage. - - - Instructions per :ref:`normalization unit ` +.. jinja:: ta-instruction-counts + :file: _templates/metrics_table.j2 .. note:: @@ -333,38 +179,8 @@ Spill / stack metrics Finally, the address processing unit contains a separate coalescing stage for spill/stack memory, and thus reports: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Spill/Stack Total Cycles - - - The number of cycles the address processing unit spent working on - spill/stack instructions, per - :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` - - * - Spill/Stack Coalesced Read Cycles - - - The number of cycles the address processing unit spent working on - coalesced spill/stack read instructions, per - :ref:`normalization unit `. 
- - - Cycles per :ref:`normalization unit ` - - * - Spill/Stack Coalesced Write Cycles - - - The number of cycles the address processing unit spent working on - coalesced spill/stack write instructions, per - :ref:`normalization unit `. - - - Cycles per :ref:`normalization unit ` +.. jinja:: ta-spill-stack + :file: _templates/metrics_table.j2 .. _desc-utcl1: @@ -380,52 +196,8 @@ reduce the cost of subsequent re-translations. ROCm Compute Profiler reports the following L1 TLB metrics: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Requests - - - The number of translation requests made to the UTCL1 per - :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Hits - - - The number of translation requests that hit in the UTCL1, and could be - reused, per :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Hit Ratio - - - The ratio of the number of translation requests that hit in the UTCL1 - divided by the total number of translation requests made to the UTCL1. - - - Percent - - * - Translation Misses - - - The total number of translation requests that missed in the UTCL1 due to - translation not being present in the cache, per - :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - Permission Misses - - - The total number of translation requests that missed in the UTCL1 due to - a permission error, per :ref:`normalization unit `. - This is unused and expected to be zero in most configurations for modern - CDNA™ accelerators. - - - Requests per :ref:`normalization unit ` +.. jinja:: desc-utcl1 + :file: _templates/metrics_table.j2 .. note:: @@ -464,39 +236,8 @@ L2 requests may backpressure the wave-issue logic of the :ref:`VMEM ` pipe and prevent it from issuing more vector memory instructions until the vL1D’s outstanding requests are completed. -.. 
list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Stalled on L2 Data - - - The ratio of the number of cycles where the vL1D is stalled waiting for - requested data to return from the :doc:`L2 cache ` divided by - the number of cycles where the vL1D is active [#vl1d-activity]_. - - - Percent - - * - Stalled on L2 Requests - - - The ratio of the number of cycles where the vL1D is stalled waiting to - issue a request for data to the :doc:`L2 cache ` divided by the - number of cycles where the vL1D is active [#vl1d-activity]_. - - - Percent - - * - Tag RAM Stall (Read/Write/Atomic) - - - The ratio of the number of cycles where the vL1D is stalled due to - Read/Write/Atomic requests with conflicting tags being looked up - concurrently, divided by the number of cycles where the - vL1D is active [#vl1d-activity]_. - - - Percent +.. jinja:: vl1d-cache-stall-metrics + :file: _templates/metrics_table.j2 .. _vl1d-cache-access-metrics: @@ -510,135 +251,8 @@ the :doc:`L2 cache `. In addition, this section includes the approximate latencies of accesses to the cache itself, along with latencies of read/write memory operations to the :doc:`L2 cache `. -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Total Requests - - - The total number of incoming requests from the - :ref:`address processing unit ` after coalescing. - - - Requests - - * - Total read/write/atomic requests - - - The total number of incoming read/write/atomic requests from the - :ref:`address processing unit ` after coalescing per - :ref:`normalization unit ` - - - Requests per :ref:`normalization unit ` - - * - Cache Bandwidth - - - The number of bytes looked up in the vL1D cache as a result of - :ref:`VMEM ` instructions per - :ref:`normalization unit `. The number of bytes is - calculated as the number of cache lines requested multiplied by the cache - line size. 
This value does not consider partial requests, so for - instance, if only a single value is requested in a cache line, the data - movement will still be counted as a full cache line. - - - Bytes per :ref:`normalization unit ` - - * - Cache Hit Rate [#vl1d-hit]_ - - - The ratio of the number of vL1D cache line requests that hit in vL1D - cache over the total number of cache line requests to the - :ref:`vL1D Cache RAM `. - - - Percent - - * - Cache Accesses - - - The total number of cache line lookups in the vL1D. - - - Cache lines - - * - Cache Hits [#vl1d-hit]_ - - - The number of cache accesses minus the number of outgoing requests to the - :doc:`L2 cache `, that is, the number of cache line requests - serviced by the :ref:`vL1D Cache RAM ` per - :ref:`normalization unit `. - - - Cache lines per :ref:`normalization unit ` - - * - Invalidations - - - The number of times the vL1D was issued a write-back invalidate command - during the kernel's execution per - :ref:`normalization unit `. This may be triggered - by, for instance, the ``buffer_wbinvl1`` instruction. - - - Invalidations per :ref:`normalization unit ` - - * - L1-L2 Bandwidth - - - The number of bytes transferred across the vL1D-L2 interface as a result - of :ref:`VMEM ` instructions, per - :ref:`normalization unit `. The number of bytes is - calculated as the number of cache lines requested multiplied by the cache - line size. This value does not consider partial requests, so for - instance, if only a single value is requested in a cache line, the data - movement will still be counted as a full cache line. - - - Bytes per :ref:`normalization unit ` - - * - L1-L2 Reads - - - The number of read requests for a vL1D cache line that were not satisfied - by the vL1D and must be retrieved from the to the - :doc:`L2 Cache ` per - :ref:`normalization unit `. 
- - - Requests per :ref:`normalization unit ` - - * - L1-L2 Writes - - - The number of write requests to a vL1D cache line that were sent through - the vL1D to the :doc:`L2 cache `, per - :ref:`normalization unit `. - - - Requests per :ref:`normalization unit ` - - * - L1-L2 Atomics - - - The number of atomic requests that are sent through the vL1D to the - :doc:`L2 cache `, per - :ref:`normalization unit `. This includes requests - for atomics with, and without return. - - - Requests per :ref:`normalization unit ` - - * - L1 Access Latency - - - Calculated as the average number of cycles that a vL1D cache line request - spent in the vL1D cache pipeline. - - - Cycles - - * - L1-L2 Read Access Latency - - - Calculated as the average number of cycles that the vL1D cache took to - issue and receive read requests from the :doc:`L2 Cache `. This - number also includes requests for atomics with return values. - - - Cycles - - * - L1-L2 Write Access Latency - - - Calculated as the average number of cycles that the vL1D cache took to - issue and receive acknowledgement of a write request to the - :doc:`L2 Cache `. This number also includes requests for - atomics without return values. - - - Cycles +.. jinja:: vl1d-cache-access-metrics + :file: _templates/metrics_table.j2 .. note:: @@ -687,80 +301,5 @@ data, and returned to the appropriate SIMD. ROCm Compute Profiler reports the following vL1D data-return path metrics: -.. list-table:: - :header-rows: 1 - - * - Metric - - - Description - - - Unit - - * - Data-return Busy - - - Percent of the :ref:`total CU cycles ` the data-return - unit was busy processing or waiting on data to return to the - :doc:`CU `. - - - Percent - - * - Cache RAM → Data-return Stall - - - Percent of the :ref:`total CU cycles ` the data-return - unit was stalled on data to be returned from the - :ref:`vL1D Cache RAM `. 
- - - Percent - - * - Workgroup manager → Data-return Stall - - - Percent of the :ref:`total CU cycles ` the data-return - unit was stalled by the :ref:`workgroup manager ` due to - initialization of registers as a part of launching new workgroups. - - - Percent - - * - Coalescable Instructions - - - The number of instructions submitted to the - :ref:`data-return unit ` by the - :ref:`address processor ` that were found to be coalescable, per - :ref:`normalization unit `. - - - Instructions per :ref:`normalization unit ` - - * - Read Instructions - - - The number of read instructions submitted to the - :ref:`data-return unit ` by the - :ref:`address processor ` summed over all - :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. This is expected to be - the sum of global/generic and spill/stack reads in the - :ref:`address processor `. - - - Instructions per :ref:`normalization unit ` - - * - Write Instructions - - - The number of store instructions submitted to the - :ref:`data-return unit ` by the - :ref:`address processor ` summed over all - :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. This is expected to be - the sum of global/generic and spill/stack stores counted by the - :ref:`vL1D cache-front-end `. - - - Instructions per :ref:`normalization unit ` - - * - Atomic Instructions - - - The number of atomic instructions submitted to the - :ref:`data-return unit ` by the - :ref:`address processor ` summed over all - :doc:`compute units ` on the accelerator, per - :ref:`normalization unit `. This is expected to be - the sum of global/generic and spill/stack atomics in the - :ref:`address processor `. - - - Instructions per :ref:`normalization unit ` +.. 
jinja:: desc-td + :file: _templates/metrics_table.j2 diff --git a/projects/rocprofiler-compute/docs/conf.py b/projects/rocprofiler-compute/docs/conf.py index c98dafb60e..aef5591810 100644 --- a/projects/rocprofiler-compute/docs/conf.py +++ b/projects/rocprofiler-compute/docs/conf.py @@ -30,6 +30,8 @@ import re +import yaml + with open("../VERSION", encoding="utf-8") as f: match = re.search(r"([0-9.]+)[^0-9.]+", f.read()) if not match: @@ -43,7 +45,12 @@ copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved version = version_number release = version_number -extensions = ["rocm_docs", "sphinx.ext.extlinks", "sphinxcontrib.datatemplates"] +extensions = [ + "rocm_docs", + "sphinx.ext.extlinks", + "sphinxcontrib.datatemplates", + "sphinx_jinja", +] html_theme = "rocm_docs_theme" html_theme_options = {"flavor": "rocm"} html_title = f"{project} {version_number} documentation" @@ -52,6 +59,113 @@ exclude_patterns = ["archive", "*/includes"] html_static_path = ["sphinx/static/css"] html_css_files = ["o_custom.css"] +with open("data/metrics_description.yaml", "r", encoding="utf-8") as f: + metrics_data = yaml.safe_load(f) +jinja_contexts = { + "wavefront-launch-stats": { + "data": metrics_data["Wavefront launch stats"], + }, + "wavefront-runtime-stats": { + "data": metrics_data["Wavefront runtime stats"], + }, + "instruction-mix": { + "data": metrics_data["Overall instruction mix"], + }, + "valu-arith-instruction-mix": { + "data": metrics_data["VALU arithmetic instruction mix"], + }, + "mfma-instruction-mix": { + "data": metrics_data["MFMA instruction mix"], + }, + "compute-speed-of-light": { + "data": metrics_data["Compute Speed-of-Light"], + }, + "pipeline-stats": { + "data": metrics_data["Pipeline statistics"], + }, + "arithmetic-operations": { + "data": metrics_data["Arithmetic operations"], + }, + "lds-sol": { + "data": metrics_data["LDS Speed-of-Light"], + }, + "lds-stats": { + "data": metrics_data["LDS Statistics"], + }, + "vl1d-sol": { + "data": 
metrics_data["vL1D Speed-of-Light"], + }, + "ta-busy-stall": { + "data": metrics_data["Busy / stall metrics"], + }, + "ta-instruction-counts": { + "data": metrics_data["Instruction counts"], + }, + "ta-spill-stack": { + "data": metrics_data["Spill / stack metrics"], + }, + "desc-utcl1": { + "data": metrics_data["L1 Unified Translation Cache (UTCL1)"], + }, + "vl1d-cache-stall-metrics": { + "data": metrics_data["vL1D cache stall metrics"], + }, + "vl1d-cache-access-metrics": { + "data": metrics_data["vL1D cache access metrics"], + }, + "desc-td": { + "data": metrics_data["Vector L1 data-return path or Texture Data (TD)"], + }, + "l2-sol": { + "data": metrics_data["L2 Speed-of-Light"], + }, + "l2-cache-accesses": { + "data": metrics_data["L2 cache accesses"], + }, + "l2-fabric-metrics": { + "data": metrics_data["L2-Fabric interface metrics"], + }, + "l2-detailed-metrics": { + "data": metrics_data["L2 - Fabric interface detailed metrics"], + }, + "l2-fabric-stalls": { + "data": metrics_data["L2 - Fabric Interface stalls"], + }, + "desc-sl1d-sol": { + "data": metrics_data["Scalar L1D Speed-of-Light"], + }, + "desc-sl1d-stats": { + "data": metrics_data["Scalar L1D cache accesses"], + }, + "desc-sl1d-l2-interface": { + "data": metrics_data["Scalar L1D Cache - L2 Interface"], + }, + "desc-l1i-sol": { + "data": metrics_data["L1I Speed-of-Light"], + }, + "desc-l1i-stats": { + "data": metrics_data["L1I cache accesses"], + }, + "desc-l1i-l2-interface": { + "data": metrics_data["L1I <-> L2 interface"], + }, + "spi-util": { + "data": metrics_data["Workgroup manager utilizations"], + }, + "spi-resc-util": { + "data": metrics_data["Workgroup Manager - Resource Allocation"], + }, + "cpf-metrics": { + "data": metrics_data["Command processor fetcher (CPF)"], + }, + "cpc-metrics": { + "data": metrics_data["Command processor packet processor (CPC)"], + }, + "sys-sol": { + "data": metrics_data["System Speed-of-Light"], + }, +} + external_toc_path = "./sphinx/_toc.yml" 
external_projects_current_project = "rocprofiler-compute" @@ -96,3 +210,6 @@ extlinks = { "HSA Runtime Programmer's Reference Manual (page %s)", ), } + +# Uncomment if facing rate limit exceed issue with local build +external_projects_remote_repository = "" \ No newline at end of file diff --git a/projects/rocprofiler-compute/docs/data/metrics_description.yaml b/projects/rocprofiler-compute/docs/data/metrics_description.yaml new file mode 100644 index 0000000000..7184df52e1 --- /dev/null +++ b/projects/rocprofiler-compute/docs/data/metrics_description.yaml @@ -0,0 +1,4914 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Wavefront launch stats: + Grid Size: + rst: The total number of work-items (or, threads) launched as a part of the kernel + dispatch. In HIP, this is equivalent to the total grid size multiplied by the + total workgroup (or, block) size. + unit: Work-Items + Workgroup Size: + rst: The total number of work-items (or, threads) in each workgroup (or, block) + launched as part of the kernel dispatch. In HIP, this is equivalent to the total + block size. + unit: Work-Items + Total Wavefronts: + rst: "The total number of wavefronts launched as part of the kernel dispatch.\ + \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\ + \ size is always 64 work-items. Thus, the total number of wavefronts should\ + \ be equivalent to the ceiling of grid size divided by 64." + unit: Wavefronts + Saved Wavefronts: + rst: The total number of wavefronts saved at a context-save. See `cwsr_enable + `_. + unit: Wavefronts + Restored Wavefronts: + rst: The total number of wavefronts restored from a context-save. See `cwsr_enable + `_. + unit: Wavefronts + VGPRs: + rst: 'The number of architected vector general-purpose registers allocated for the + kernel, see :ref:`VALU `. 
Note: this may not exactly match the + number of VGPRs requested by the compiler due to allocation granularity.' + unit: VGPRs + AGPRs: + rst: 'The number of accumulation vector general-purpose registers allocated for the + kernel, see :ref:`AGPRs `. Note: this may not exactly match the + number of AGPRs requested by the compiler due to allocation granularity.' + unit: AGPRs + SGPRs: + rst: 'The number of scalar general-purpose registers allocated for the kernel, see + :ref:`SALU `. Note: this may not exactly match the number of SGPRs + requested by the compiler due to allocation granularity.' + unit: SGPRs + LDS Allocation: + rst: 'The number of bytes of :doc:`LDS ` memory (or, shared memory) + allocated for this kernel. Note: This may also be larger than what was requested + at compile time due to both allocation granularity and dynamic per-dispatch + LDS allocations.' + unit: Bytes per workgroup + Scratch Allocation: + rst: The number of bytes of :ref:`scratch memory ` requested per + work-item for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + unit: Bytes per work-item + Kernel Time: + rst: The total duration of the executed kernel. + unit: Nanoseconds + Kernel Time (Cycles): + rst: The total duration of the executed kernel in cycles. + unit: Cycles + Instructions per wavefront: + rst: The average number of instructions (of all types) executed per wavefront. + This is averaged over all wavefronts in a kernel dispatch. + unit: Instructions per wavefront + Wave Cycles: + rst: 'The number of cycles a wavefront in the kernel dispatch spent resident on a + compute unit per :ref:`normalization unit `. This is averaged + over all wavefronts in a kernel dispatch. Note: this should not be directly + compared to the kernel cycles above.'
+ unit: Cycles per normalization unit + Dependency Wait Cycles: + rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on + memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) + per :ref:`normalization unit `. This counter is incremented + at every cycle by *all* wavefronts on a CU stalled at a memory operation. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could + be actively executing while a wave is stalled. The sum of this metric, Issue + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + unit: Cycles per normalization unit + Issue Wait Cycles: + rst: The number of cycles a wavefront in the kernel dispatch was unable to issue + an instruction for any reason (e.g., execution pipe back-pressure, arbitration + loss, etc.) per :ref:`normalization unit `. This counter + is incremented at every cycle by *all* wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could + be actively executing while a wave is issue stalled. The sum of this metric, + Dependency Wait Cycles and Active Cycles should be equal to the total Wave + Cycles metric. + unit: Cycles per normalization unit + Active Cycles: + rst: The average number of cycles a wavefront in the kernel dispatch was actively + executing instructions per :ref:`normalization unit `. + This measurement is made on a per-wavefront basis, and may include cycles that + another wavefront spent actively executing (on another execution unit, for + example) or was stalled. As such, it is most useful to get a sense of how + waves were spending their time, rather than identification of a precise limiter. 
+ The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal + to the total Wave Cycles metric. + unit: Cycles per normalization unit + Wavefront Occupancy: + rst: 'The time-averaged number of wavefronts resident on the accelerator over the + lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + unit: Wavefronts +Wavefront runtime stats: + Grid Size: + rst: The total number of work-items (or, threads) launched as a part of the kernel + dispatch. In HIP, this is equivalent to the total grid size multiplied by the + total workgroup (or, block) size. + unit: Work-Items + Workgroup Size: + rst: The total number of work-items (or, threads) in each workgroup (or, block) + launched as part of the kernel dispatch. In HIP, this is equivalent to the total + block size. + unit: Work-Items + Total Wavefronts: + rst: "The total number of wavefronts launched as part of the kernel dispatch.\ + \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\ + \ size is always 64 work-items. Thus, the total number of wavefronts should\ + \ be equivalent to the ceiling of grid size divided by 64." + unit: Wavefronts + Saved Wavefronts: + rst: The total number of wavefronts saved at a context-save. See `cwsr_enable + `_. + unit: Wavefronts + Restored Wavefronts: + rst: The total number of wavefronts restored from a context-save. See `cwsr_enable + `_. + unit: Wavefronts + VGPRs: + rst: 'The number of architected vector general-purpose registers allocated for the + kernel, see :ref:`VALU `. Note: this may not exactly match the + number of VGPRs requested by the compiler due to allocation granularity.' + unit: VGPRs + AGPRs: + rst: 'The number of accumulation vector general-purpose registers allocated for the + kernel, see :ref:`AGPRs `. Note: this may not exactly match the + number of AGPRs requested by the compiler due to allocation granularity.'
+ unit: AGPRs + SGPRs: + rst: 'The number of scalar general-purpose registers allocated for the kernel, see + :ref:`SALU `. Note: this may not exactly match the number of SGPRs + requested by the compiler due to allocation granularity.' + unit: SGPRs + LDS Allocation: + rst: 'The number of bytes of :doc:`LDS ` memory (or, shared memory) + allocated for this kernel. Note: This may also be larger than what was requested + at compile time due to both allocation granularity and dynamic per-dispatch + LDS allocations.' + unit: Bytes per workgroup + Scratch Allocation: + rst: The number of bytes of :ref:`scratch memory ` requested per + work-item for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + unit: Bytes per work-item + Kernel Time: + rst: The total duration of the executed kernel. + unit: Nanoseconds + Kernel Time (Cycles): + rst: The total duration of the executed kernel in cycles. + unit: Cycles + Instructions per wavefront: + rst: The average number of instructions (of all types) executed per wavefront. + This is averaged over all wavefronts in a kernel dispatch. + unit: Instructions per wavefront + Wave Cycles: + rst: 'The number of cycles a wavefront in the kernel dispatch spent resident on a + compute unit per :ref:`normalization unit `. This is averaged + over all wavefronts in a kernel dispatch. Note: this should not be directly + compared to the kernel cycles above.' + unit: Cycles per normalization unit + Dependency Wait Cycles: + rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on + memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) + per :ref:`normalization unit `. This counter is incremented + at every cycle by *all* wavefronts on a CU stalled at a memory operation.
As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could + be actively executing while a wave is stalled. The sum of this metric, Issue + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + unit: Cycles per normalization unit + Issue Wait Cycles: + rst: The number of cycles a wavefront in the kernel dispatch was unable to issue + an instruction for any reason (e.g., execution pipe back-pressure, arbitration + loss, etc.) per :ref:`normalization unit `. This counter + is incremented at every cycle by *all* wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could + be actively executing while a wave is issue stalled. The sum of this metric, + Dependency Wait Cycles and Active Cycles should be equal to the total Wave + Cycles metric. + unit: Cycles per normalization unit + Active Cycles: + rst: The average number of cycles a wavefront in the kernel dispatch was actively + executing instructions per :ref:`normalization unit `. + This measurement is made on a per-wavefront basis, and may include cycles that + another wavefront spent actively executing (on another execution unit, for + example) or was stalled. As such, it is most useful to get a sense of how + waves were spending their time, rather than identification of a precise limiter. + The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal + to the total Wave Cycles metric. + unit: Cycles per normalization unit + Wavefront Occupancy: + rst: 'The time-averaged number of wavefronts resident on the accelerator over the + lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).'
+ unit: Wavefronts +Overall instruction mix: + VALU: + rst: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the :doc:`compute unit `, and are + used to execute a wide range of instruction types including floating point + operations, non-uniform address calculations, transcendental operations, integer + operations, shifts, conditional evaluation, etc. + unit: Instructions + VMEM: + rst: The total number of vector memory operations issued. These include most loads, + stores and atomic operations and all accesses to :ref:`generic, global, private + and texture ` memory. + unit: Instructions + LDS: + rst: The total number of LDS (also known as shared memory) operations issued. These + include loads, stores, atomics, and HIP's ``__shfl`` operations. + unit: Instructions + MFMA: + rst: The total number of matrix fused multiply-add instructions issued. + unit: Instructions + SALU: + rst: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + unit: Instructions + SMEM: + rst: The total number of scalar memory (SMEM) operations issued. These are typically + used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__`` + memory. + unit: Instructions + Branch: + rst: The total number of branch operations issued. These typically consist of jump + or branch operations and are used to implement control flow. + unit: Instructions + INT32: + rst: The total number of instructions operating on 32-bit integer operands issued + to the VALU per :ref:`normalization unit `. 
+ unit: Instructions per normalization unit + INT64: + rst: The total number of instructions operating on 64-bit integer operands issued + to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-ADD: + rst: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-MUL: + rst: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-FMA: + rst: The total number of fused multiply-add instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-Trans: + rst: The total number of transcendental instructions (e.g., `sqrt`) operating on + 16-bit floating-point operands issued to the VALU per :ref:`normalization unit + `. + unit: Instructions per normalization unit + F32-ADD: + rst: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-MUL: + rst: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-FMA: + rst: The total number of fused multiply-add instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-Trans: + rst: The total number of transcendental instructions (such as ``sqrt``) operating + on 32-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. 
+ unit: Instructions per normalization unit + F64-ADD: + rst: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-MUL: + rst: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-FMA: + rst: The total number of fused multiply-add instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-Trans: + rst: The total number of transcendental instructions (such as `sqrt`) operating + on 64-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Conversion: + rst: "The total number of type conversion instructions (such as converting data\ + \ to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit\ + \ `." + unit: Instructions per normalization unit + Global/Generic Instr: + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Global/Generic Read: + rst: The total number of global & generic memory read instructions executed on all + :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Write: + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. 
+ unit: Instructions per normalization unit + Global/Generic Atomic: + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instr: + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read: + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write: + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Atomic: + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. Typically unused as these + memory operations are typically used to implement thread-local storage. + unit: Instructions per normalization unit + MFMA-I8: + rst: The total number of 8-bit integer :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F8: + rst: The total number of 8-bit floating point :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. This is supported in AMD + Instinct MI300 series and later only. + unit: Instructions per normalization unit + MFMA-F16: + rst: The total number of 16-bit floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. 
+ unit: Instructions per normalization unit + MFMA-BF16: + rst: The total number of 16-bit brain floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F32: + rst: The total number of 32-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F64: + rst: The total number of 64-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit +VALU arithmetic instruction mix: + VALU: + rst: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the :doc:`compute unit `, and are + used to execute a wide range of instruction types including floating point + operations, non-uniform address calculations, transcendental operations, integer + operations, shifts, conditional evaluation, etc. + unit: Instructions + VMEM: + rst: The total number of vector memory operations issued. These include most loads, + stores and atomic operations and all accesses to :ref:`generic, global, private + and texture ` memory. + unit: Instructions + LDS: + rst: The total number of LDS (also known as shared memory) operations issued. These + include loads, stores, atomics, and HIP's ``__shfl`` operations. + unit: Instructions + MFMA: + rst: The total number of matrix fused multiply-add instructions issued. + unit: Instructions + SALU: + rst: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + unit: Instructions + SMEM: + rst: The total number of scalar memory (SMEM) operations issued. 
These are typically + used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__`` + memory. + unit: Instructions + Branch: + rst: The total number of branch operations issued. These typically consist of jump + or branch operations and are used to implement control flow. + unit: Instructions + INT32: + rst: The total number of instructions operating on 32-bit integer operands issued + to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + INT64: + rst: The total number of instructions operating on 64-bit integer operands issued + to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-ADD: + rst: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-MUL: + rst: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-FMA: + rst: The total number of fused multiply-add instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-Trans: + rst: The total number of transcendental instructions (e.g., `sqrt`) operating on + 16-bit floating-point operands issued to the VALU per :ref:`normalization unit + `. + unit: Instructions per normalization unit + F32-ADD: + rst: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-MUL: + rst: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. 
+ unit: Instructions per normalization unit + F32-FMA: + rst: The total number of fused multiply-add instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-Trans: + rst: The total number of transcendental instructions (such as ``sqrt``) operating + on 32-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + F64-ADD: + rst: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-MUL: + rst: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-FMA: + rst: The total number of fused multiply-add instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-Trans: + rst: The total number of transcendental instructions (such as `sqrt`) operating + on 64-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Conversion: + rst: "The total number of type conversion instructions (such as converting data\ + \ to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit\ + \ `." + unit: Instructions per normalization unit + Global/Generic Instr: + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Global/Generic Read: + rst: The total number of global & generic memory read instructions executed on all + :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. 
+ unit: Instructions per normalization unit + Global/Generic Write: + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic: + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instr: + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read: + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write: + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Atomic: + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. Typically unused as these + memory operations are typically used to implement thread-local storage. + unit: Instructions per normalization unit + MFMA-I8: + rst: The total number of 8-bit integer :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F8: + rst: The total number of 8-bit floating point :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. This is supported in AMD + Instinct MI300 series and later only. 
+ unit: Instructions per normalization unit + MFMA-F16: + rst: The total number of 16-bit floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-BF16: + rst: The total number of 16-bit brain floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F32: + rst: The total number of 32-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F64: + rst: The total number of 64-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit +MFMA instruction mix: + VALU: + rst: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the :doc:`compute unit `, and are + used to execute a wide range of instruction types including floating point + operations, non-uniform address calculations, transcendental operations, integer + operations, shifts, conditional evaluation, etc. + unit: Instructions + VMEM: + rst: The total number of vector memory operations issued. These include most loads, + stores and atomic operations and all accesses to :ref:`generic, global, private + and texture ` memory. + unit: Instructions + LDS: + rst: The total number of LDS (also known as shared memory) operations issued. These + include loads, stores, atomics, and HIP's ``__shfl`` operations. + unit: Instructions + MFMA: + rst: The total number of matrix fused multiply-add instructions issued. + unit: Instructions + SALU: + rst: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. 
+ unit: Instructions + SMEM: + rst: The total number of scalar memory (SMEM) operations issued. These are typically + used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__`` + memory. + unit: Instructions + Branch: + rst: The total number of branch operations issued. These typically consist of jump + or branch operations and are used to implement control flow. + unit: Instructions + INT32: + rst: The total number of instructions operating on 32-bit integer operands issued + to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + INT64: + rst: The total number of instructions operating on 64-bit integer operands issued + to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-ADD: + rst: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-MUL: + rst: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-FMA: + rst: The total number of fused multiply-add instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-Trans: + rst: The total number of transcendental instructions (e.g., `sqrt`) operating on + 16-bit floating-point operands issued to the VALU per :ref:`normalization unit + `. + unit: Instructions per normalization unit + F32-ADD: + rst: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-MUL: + rst: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. 
+ unit: Instructions per normalization unit + F32-FMA: + rst: The total number of fused multiply-add instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-Trans: + rst: The total number of transcendental instructions (such as ``sqrt``) operating + on 32-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + F64-ADD: + rst: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-MUL: + rst: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-FMA: + rst: The total number of fused multiply-add instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-Trans: + rst: The total number of transcendental instructions (such as ``sqrt``) operating + on 64-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Conversion: + rst: "The total number of type conversion instructions (such as converting data\ + \ to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit\ + \ `." + unit: Instructions per normalization unit + Global/Generic Instr: + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Global/Generic Read: + rst: The total number of global & generic memory read instructions executed on all + :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `.
+ unit: Instructions per normalization unit + Global/Generic Write: + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic: + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instr: + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read: + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write: + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Atomic: + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. Typically unused as these + memory operations are typically used to implement thread-local storage. + unit: Instructions per normalization unit + MFMA-I8: + rst: The total number of 8-bit integer :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F8: + rst: The total number of 8-bit floating point :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. This is supported in AMD + Instinct MI300 series and later only. 
+ unit: Instructions per normalization unit + MFMA-F16: + rst: The total number of 16-bit floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-BF16: + rst: The total number of 16-bit brain floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F32: + rst: The total number of 32-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F64: + rst: The total number of 64-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit +Compute Speed-of-Light: + VALU FLOPs: + rst: 'The total floating-point operations executed per second on the :ref:`VALU + `. This is also presented as a percent of the peak theoretical FLOPs + achievable on the specific accelerator. Note: this does not include any floating-point + operations from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU IOPs: + rst: 'The total integer operations executed per second on the :ref:`VALU `. + This is also presented as a percent of the peak theoretical IOPs achievable + on the specific accelerator. Note: this does not include any integer operations + from :ref:`MFMA ` instructions.' + unit: GIOPs + MFMA FLOPs (BF16): + rst: 'The total number of 16-bit brain floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit brain floating + point operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical BF16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F16): + rst: 'The total number of 16-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit floating point + operations from :ref:`VALU ` instructions. 
This is also presented + as a percent of the peak theoretical F16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F32): + rst: 'The total number of 32-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 32-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F32 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F64): + rst: 'The total number of 64-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 64-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F64 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA IOPs (INT8): + rst: 'The total number of 8-bit integer :ref:`MFMA ` operations executed + per second. Note: this does not include any 8-bit integer operations from :ref:`VALU + ` instructions. This is also presented as a percent of the peak + theoretical INT8 MFMA operations achievable on the specific accelerator.' + unit: GIOPs + IPC: + rst: The ratio of the total number of instructions executed on the :doc:`CU ` + over the :ref:`total active CU cycles `. + unit: Instructions per cycle + IPC (Issued): + rst: The ratio of the total number of (non-:ref:`internal `) + instructions issued over the number of cycles where the :ref:`scheduler ` + was actively working on issuing instructions. Refer to the :ref:`Issued IPC + ` example for further detail.
+ unit: Instructions per cycle + SALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`SALU ` + was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing SALU / :ref:`SMEM + ` instructions over the :ref:`total CU cycles `. + unit: Percent + VALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VALU ` + was busy executing instructions. Does not include :ref:`VMEM ` operations. + Computed as the ratio of the total number of cycles spent by the :ref:`scheduler + ` issuing VALU instructions over the :ref:`total CU cycles + `. + unit: Percent + VMEM Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VMEM ` + unit was busy executing instructions, including both global/generic and spill/scratch + operations (see the :ref:`VMEM instruction count metrics ` + for more detail). Does not include :ref:`VALU ` operations. Computed as + the ratio of the total number of cycles spent by the :ref:`scheduler ` + issuing VMEM instructions over the :ref:`total CU cycles `. + unit: Percent + Branch Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`branch ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing branch instructions + over the :ref:`total CU cycles `. + unit: Percent + VALU Active Threads: + rst: Indicates the average level of :ref:`divergence ` within a + wavefront over the lifetime of the kernel. The number of work-items that were + active in a wavefront during execution of each :ref:`VALU ` instruction, + time-averaged over all VALU instructions run on all wavefronts in the kernel. + unit: Work-items + MFMA Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`MFMA ` + unit was busy executing instructions. 
Computed as the ratio of the total number + of cycles the :ref:`MFMA ` unit was busy over the :ref:`total + CU cycles `. + unit: Percent + MFMA Instruction Cycles: + rst: The average duration of :ref:`MFMA ` instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. Compare to, for example, + the `AMD Matrix Instruction Calculator `_. + unit: Cycles per instruction + VMEM Latency: + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a VMEM instruction to complete. + unit: Cycles + SMEM Latency: + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a SMEM instruction to complete. + unit: Cycles + FLOPs (Total): + rst: The total number of floating-point operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: FLOP per normalization unit + IOPs (Total): + rst: The total number of integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: IOP per normalization unit + F16 OPs: + rst: The total number of 16-bit floating-point operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: FLOP per normalization unit + BF16 OPs: + rst: 'The total number of 16-bit brain floating-point operations executed on either + the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. Note: on current CDNA accelerators, the VALU has + no native BF16 instructions.' + unit: FLOP per normalization unit + F32 OPs: + rst: The total number of 32-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `.
+ unit: FLOP per normalization unit + F64 OPs: + rst: The total number of 64-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + INT8 OPs: + rst: 'The total number of 8-bit integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. Note: on current CDNA accelerators, the VALU has no + native INT8 instructions.' + unit: IOP per normalization unit +Pipeline statistics: + VALU FLOPs: + rst: 'The total floating-point operations executed per second on the :ref:`VALU + `. This is also presented as a percent of the peak theoretical FLOPs + achievable on the specific accelerator. Note: this does not include any floating-point + operations from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU IOPs: + rst: 'The total integer operations executed per second on the :ref:`VALU `. + This is also presented as a percent of the peak theoretical IOPs achievable + on the specific accelerator. Note: this does not include any integer operations + from :ref:`MFMA ` instructions.' + unit: GIOPs + MFMA FLOPs (BF16): + rst: 'The total number of 16-bit brain floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit brain floating + point operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical BF16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F16): + rst: 'The total number of 16-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F32): + rst: 'The total number of 32-bit floating point :ref:`MFMA ` operations + executed per second. 
Note: this does not include any 32-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F32 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F64): + rst: 'The total number of 64-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 64-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F64 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA IOPs (INT8): + rst: 'The total number of 8-bit integer :ref:`MFMA ` operations executed + per second. Note: this does not include any 8-bit integer operations from :ref:`VALU + ` instructions. This is also presented as a percent of the peak + theoretical INT8 MFMA operations achievable on the specific accelerator.' + unit: GIOPs + IPC: + rst: The ratio of the total number of instructions executed on the :doc:`CU ` + over the :ref:`total active CU cycles `. + unit: Instructions per cycle + IPC (Issued): + rst: The ratio of the total number of (non-:ref:`internal `) + instructions issued over the number of cycles where the :ref:`scheduler ` + was actively working on issuing instructions. Refer to the :ref:`Issued IPC + ` example for further detail. + unit: Instructions per cycle + SALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`SALU ` + was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing SALU / :ref:`SMEM + ` instructions over the :ref:`total CU cycles `.
+ unit: Percent + VALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VALU ` + was busy executing instructions. Does not include :ref:`VMEM ` operations. + Computed as the ratio of the total number of cycles spent by the :ref:`scheduler + ` issuing VALU instructions over the :ref:`total CU cycles + `. + unit: Percent + VMEM Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VMEM ` + unit was busy executing instructions, including both global/generic and spill/scratch + operations (see the :ref:`VMEM instruction count metrics ` + for more detail). Does not include :ref:`VALU ` operations. Computed as + the ratio of the total number of cycles spent by the :ref:`scheduler ` + issuing VMEM instructions over the :ref:`total CU cycles `. + unit: Percent + Branch Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`branch ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing branch instructions + over the :ref:`total CU cycles `. + unit: Percent + VALU Active Threads: + rst: Indicates the average level of :ref:`divergence ` within a + wavefront over the lifetime of the kernel. The number of work-items that were + active in a wavefront during execution of each :ref:`VALU ` instruction, + time-averaged over all VALU instructions run on all wavefronts in the kernel. + unit: Work-items + MFMA Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`MFMA ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles the :ref:`MFMA ` unit was busy over the :ref:`total + CU cycles `. + unit: Percent + MFMA Instruction Cycles: + rst: The average duration of :ref:`MFMA ` instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions.
Compare to, for example, + the `AMD Matrix Instruction Calculator `_. + unit: Cycles per instruction + VMEM Latency: + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a VMEM instruction to complete. + unit: Cycles + SMEM Latency: + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a SMEM instruction to complete. + unit: Cycles + FLOPs (Total): + rst: The total number of floating-point operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: FLOP per normalization unit + IOPs (Total): + rst: The total number of integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: IOP per normalization unit + F16 OPs: + rst: The total number of 16-bit floating-point operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: FLOP per normalization unit + BF16 OPs: + rst: 'The total number of 16-bit brain floating-point operations executed on either + the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. Note: on current CDNA accelerators, the VALU has + no native BF16 instructions.' + unit: FLOP per normalization unit + F32 OPs: + rst: The total number of 32-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + F64 OPs: + rst: The total number of 64-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + INT8 OPs: + rst: 'The total number of 8-bit integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. Note: on current CDNA accelerators, the VALU has no + native INT8 instructions.' 
+ unit: IOP per normalization unit +Arithmetic operations: + VALU FLOPs: + rst: 'The total floating-point operations executed per second on the :ref:`VALU + `. This is also presented as a percent of the peak theoretical FLOPs + achievable on the specific accelerator. Note: this does not include any floating-point + operations from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU IOPs: + rst: 'The total integer operations executed per second on the :ref:`VALU `. + This is also presented as a percent of the peak theoretical IOPs achievable + on the specific accelerator. Note: this does not include any integer operations + from :ref:`MFMA ` instructions.' + unit: GIOPs + MFMA FLOPs (BF16): + rst: 'The total number of 16-bit brain floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit brain floating + point operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical BF16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F16): + rst: 'The total number of 16-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F32): + rst: 'The total number of 32-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 32-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F32 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F64): + rst: 'The total number of 64-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 64-bit floating point + operations from :ref:`VALU ` instructions. 
This is also presented + as a percent of the peak theoretical F64 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA IOPs (INT8): + rst: 'The total number of 8-bit integer :ref:`MFMA ` operations executed + per second. Note: this does not include any 8-bit integer operations from :ref:`VALU + ` instructions. This is also presented as a percent of the peak + theoretical INT8 MFMA operations achievable on the specific accelerator.' + unit: GIOPs + IPC: + rst: The ratio of the total number of instructions executed on the :doc:`CU ` + over the :ref:`total active CU cycles `. + unit: Instructions per cycle + IPC (Issued): + rst: The ratio of the total number of (non-:ref:`internal `) + instructions issued over the number of cycles where the :ref:`scheduler ` + was actively working on issuing instructions. Refer to the :ref:`Issued IPC + ` example for further detail. + unit: Instructions per cycle + SALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`SALU ` + was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing SALU / :ref:`SMEM + ` instructions over the :ref:`total CU cycles `. + unit: Percent + VALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VALU ` + was busy executing instructions. Does not include :ref:`VMEM ` operations. + Computed as the ratio of the total number of cycles spent by the :ref:`scheduler + ` issuing VALU instructions over the :ref:`total CU cycles + `.
+ unit: Percent + VMEM Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VMEM ` + unit was busy executing instructions, including both global/generic and spill/scratch + operations (see the :ref:`VMEM instruction count metrics ` + for more detail). Does not include :ref:`VALU ` operations. Computed as + the ratio of the total number of cycles spent by the :ref:`scheduler ` + issuing VMEM instructions over the :ref:`total CU cycles `. + unit: Percent + Branch Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`branch ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing branch instructions + over the :ref:`total CU cycles `. + unit: Percent + VALU Active Threads: + rst: Indicates the average level of :ref:`divergence ` within a + wavefront over the lifetime of the kernel. The number of work-items that were + active in a wavefront during execution of each :ref:`VALU ` instruction, + time-averaged over all VALU instructions run on all wavefronts in the kernel. + unit: Work-items + MFMA Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`MFMA ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles the :ref:`MFMA ` unit was busy over the :ref:`total + CU cycles `. + unit: Percent + MFMA Instruction Cycles: + rst: The average duration of :ref:`MFMA ` instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. Compare to, for example, + the `AMD Matrix Instruction Calculator `_. + unit: Cycles per instruction + VMEM Latency: + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a VMEM instruction to complete.
+ unit: Cycles + SMEM Latency: + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a SMEM instruction to complete. + unit: Cycles + FLOPs (Total): + rst: The total number of floating-point operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: FLOP per normalization unit + IOPs (Total): + rst: The total number of integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: IOP per normalization unit + F16 OPs: + rst: The total number of 16-bit floating-point operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: FLOP per normalization unit + BF16 OPs: + rst: 'The total number of 16-bit brain floating-point operations executed on either + the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. Note: on current CDNA accelerators, the VALU has + no native BF16 instructions.' + unit: FLOP per normalization unit + F32 OPs: + rst: The total number of 32-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + F64 OPs: + rst: The total number of 64-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + INT8 OPs: + rst: 'The total number of 8-bit integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. Note: on current CDNA accelerators, the VALU has no + native INT8 instructions.' + unit: IOP per normalization unit +LDS Speed-of-Light: + Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`LDS ` was + actively executing instructions (including, but not limited to, load, store, + atomic and HIP's ``__shfl`` operations). 
Calculated as the ratio of the total + number of cycles LDS was active over the :ref:`total CU cycles `. + unit: Percent + Access Rate: + rst: Indicates the percentage of SIMDs in the :ref:`VALU ` [#lds-workload]_ + actively issuing LDS instructions, averaged over the lifetime of the kernel. + Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler + ` issuing :ref:`LDS ` instructions over the :ref:`total + CU cycles `. + unit: Percent + Theoretical Bandwidth: + rst: Indicates the maximum amount of bytes that could have been loaded from, stored + to, or atomically updated in the LDS per :ref:`normalization unit `. + Does *not* take into account the execution mask of the wavefront when the instruction + was executed. See the :ref:`LDS bandwidth example ` for more + detail. + unit: Bytes per normalization unit + Bank Conflict Rate: + rst: Indicates the percentage of active LDS cycles that were spent servicing bank + conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts + over the number of LDS cycles that would have been required to move the same + amount of data in an uncontended access. [#lds-bank-conflict]_ + unit: Percent + LDS Instructions: + rst: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit `. + unit: Instructions per normalization unit + LDS Latency: + rst: The average number of round-trip cycles (i.e., from issue to data-return / + acknowledgment) required for an LDS instruction to complete. + unit: Cycles + Bank Conflicts/Access: + rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler ` + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. 
+ unit: Conflicts per Access + Index Accesses: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` over + all operations per :ref:`normalization unit `. + unit: Cycles per normalization unit + Atomic Return Cycles: + rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization + unit `. + unit: Cycles per normalization unit + Bank Conflict: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization + unit `. + unit: Cycles per normalization unit + Addr Conflict: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to address conflicts (as determined by the conflict resolution hardware) per + :ref:`normalization unit `. + unit: Cycles per normalization unit + Unaligned Stall: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to stalls from non-dword aligned addresses per :ref:`normalization unit `. + unit: Cycles per normalization unit + Mem Violations: + rst: "The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization\ + \ unit `. This is unused and expected to be zero in most\ + \ configurations for modern CDNA\u2122 accelerators." + unit: Accesses per normalization unit +LDS Statistics: + Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`LDS ` was + actively executing instructions (including, but not limited to, load, store, + atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total + number of cycles LDS was active over the :ref:`total CU cycles `. + unit: Percent + Access Rate: + rst: Indicates the percentage of SIMDs in the :ref:`VALU ` [#lds-workload]_ + actively issuing LDS instructions, averaged over the lifetime of the kernel. + Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler + ` issuing :ref:`LDS ` instructions over the :ref:`total + CU cycles `. 
+ unit: Percent + Theoretical Bandwidth: + rst: Indicates the maximum amount of bytes that could have been loaded from, stored + to, or atomically updated in the LDS per :ref:`normalization unit `. + Does *not* take into account the execution mask of the wavefront when the instruction + was executed. See the :ref:`LDS bandwidth example ` for more + detail. + unit: Bytes per normalization unit + Bank Conflict Rate: + rst: Indicates the percentage of active LDS cycles that were spent servicing bank + conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts + over the number of LDS cycles that would have been required to move the same + amount of data in an uncontended access. [#lds-bank-conflict]_ + unit: Percent + LDS Instructions: + rst: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit `. + unit: Instructions per normalization unit + LDS Latency: + rst: The average number of round-trip cycles (i.e., from issue to data-return / + acknowledgment) required for an LDS instruction to complete. + unit: Cycles + Bank Conflicts/Access: + rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler ` + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. + unit: Conflicts per Access + Index Accesses: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` over + all operations per :ref:`normalization unit `. + unit: Cycles per normalization unit + Atomic Return Cycles: + rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization + unit `. 
+ unit: Cycles per normalization unit + Bank Conflict: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization + unit `. + unit: Cycles per normalization unit + Addr Conflict: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to address conflicts (as determined by the conflict resolution hardware) per + :ref:`normalization unit `. + unit: Cycles per normalization unit + Unaligned Stall: + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to stalls from non-dword aligned addresses per :ref:`normalization unit `. + unit: Cycles per normalization unit + Mem Violations: + rst: "The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization\ + \ unit `. This is unused and expected to be zero in most\ + \ configurations for modern CDNA\u2122 accelerators." + unit: Accesses per normalization unit +vL1D Speed-of-Light: + Hit rate: + rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in + vL1D cache over the total number of cache line requests to the :ref:`vL1D Cache + RAM `. + unit: Percent + Bandwidth: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions, as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so for instance, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Percent + Utilization: + rst: Indicates how busy the :ref:`vL1D Cache RAM ` was during the kernel + execution. The number of cycles where the vL1D Cache RAM is actively processing + any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_. 
+ unit: Percent + Coalescing: + rst: Indicates how well memory instructions were coalesced by the :ref:`address + processing unit `, ranging from uncoalesced (25%) to fully coalesced + (100%). Calculated as the average number of :ref:`thread-requests ` + generated per instruction divided by the ideal number of thread-requests per + instruction. + unit: Percent + Stalled on L2 Data: + rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested + data to return from the :doc:`L2 cache ` divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Stalled on L2 Req: + rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue + a request for data to the :doc:`L2 cache ` divided by the number + of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Read): + rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests + with conflicting tags being looked up concurrently, divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Write): + rst: The ratio of the number of cycles where the vL1D is stalled due to Write + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Atomic): + rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Total Req: + rst: The total number of incoming requests from the :ref:`address processing + unit ` after coalescing. 
+ unit: Requests + Read Req: + rst: The total number of incoming read requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Write Req: + rst: The total number of incoming write requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Atomic Req: + rst: The total number of incoming atomic requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Cache BW: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions per :ref:`normalization unit `. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + for instance, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + unit: Bytes per normalization unit + Cache Hit Rate: + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D Cache RAM `. + unit: Percent + Cache Accesses: + rst: The total number of cache line lookups in the vL1D. + unit: Cache lines + Cache Hits: + rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2 + cache `, that is, the number of cache line requests serviced by the + :ref:`vL1D Cache RAM ` per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Invalidations: + rst: The number of times the vL1D was issued a write-back invalidate command during + the kernel's execution per :ref:`normalization unit `. This + may be triggered by, for instance, the ``buffer_wbinvl1`` instruction. 
+ unit: Invalidations per normalization unit + L1-L2 BW: + rst: The number of bytes transferred across the vL1D-L2 interface as a result of + :ref:`VMEM ` instructions, per :ref:`normalization unit `. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for instance, + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + unit: Bytes per normalization unit + L1-L2 Read: + rst: The number of read requests for a vL1D cache line that were not satisfied by + the vL1D and must be retrieved from the :doc:`L2 Cache ` per :ref:`normalization + unit `. + unit: Requests per normalization unit + L1-L2 Write: + rst: The number of write requests to a vL1D cache line that were sent through the + vL1D to the :doc:`L2 cache `, per :ref:`normalization unit `. + unit: Requests per normalization unit + L1-L2 Atomic: + rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 + cache `, per :ref:`normalization unit `. This + includes requests for atomics with, and without return. + unit: Requests per normalization unit + L1 Access Latency: + rst: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + unit: Cycles + L1-L2 Read Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive read requests from the :doc:`L2 Cache `. This number + also includes requests for atomics with return values. + unit: Cycles + L1-L2 Write Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive acknowledgement of a write request to the :doc:`L2 Cache `. + This number also includes requests for atomics without return values.
+ unit: Cycles + NC - Read: + rst: Total read requests with NC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Read: + rst: Total read requests with UC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Read: + rst: Total read requests with CC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Read: + rst: Total read requests with RW mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Write: + rst: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Write: + rst: Total write requests with NC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Write: + rst: Total write requests with UC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Write: + rst: Total write requests with CC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Atomic: + rst: Total atomic requests with NC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Atomic: + rst: Total atomic requests with UC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Atomic: + rst: Total atomic requests with CC mtype from this TCP to all TCCs. Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Atomic: + rst: Total atomic requests with RW mtype from this TCP to all TCCs. 
Sum over TCP + instances per normalization unit.
+ unit: Requests per normalization unit + Req: + rst: The number of translation requests made to the UTCL1 per normalization unit. + unit: Requests per normalization unit + Hit Ratio: + rst: The ratio of the number of translation requests that hit in the UTCL1 divided + by the total number of translation requests made to the UTCL1. + unit: Percent + Hits: + rst: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + unit: Requests per normalization unit + Translation Misses: + rst: The total number of translation requests that missed in the UTCL1 due to translation + not being present in the cache, per :ref:`normalization unit `. + unit: Requests per normalization unit + Permission Misses: + rst: "The total number of translation requests that missed in the UTCL1 due to\ + \ a permission error, per :ref:`normalization unit `.\ + \ This is unused and expected to be zero in most configurations for modern\ + \ CDNA\u2122 accelerators." + unit: Requests per normalization unit +Busy / stall metrics: + Address Processing Unit Busy: + rst: Percent of the :ref:`total CU cycles ` the address processor + was busy + unit: Percent + Address Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending address requests further into the vL1D pipeline + unit: Percent + Data Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending write/atomic data further into the vL1D pipeline + unit: Percent + "Data-Processor \u2192 Address Stall": + rst: Percent of :ref:`total CU cycles ` the address processor was + stalled waiting to send command data to the :ref:`data processor ` + unit: Percent + Total Instructions: + rst: The total number of memory instructions executed by the address processor + over all compute units on the accelerator, per normalization unit.
+ unit: Instructions per normalization unit + Global/Generic Instructions: + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Global/Generic Read Instructions: + rst: The total number of global & generic memory read instructions executed on all + :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Write Instructions: + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic Instructions: + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instructions: + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read Instructions: + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write Instructions: + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Atomic Instructions: + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. 
Typically unused as these + memory operations are typically used to implement thread-local storage. + unit: Instructions per normalization unit + Spill/Stack Total Cycles: + rst: The number of cycles the address processing unit spent working on spill/stack + instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Read: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack read instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Write: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack write instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Data-Return Busy: + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was busy processing or waiting on data to return to the :doc:`CU `. + unit: Percent + "Cache RAM \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled on data to be returned from the :ref:`vL1D Cache RAM `. + unit: Percent + "Workgroup manager \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled by the :ref:`workgroup manager ` due to initialization + of registers as a part of launching new workgroups. + unit: Percent + Coalescable Instructions: + rst: The number of instructions submitted to the :ref:`data-return unit ` + by the :ref:`address processor ` that were found to be coalescable, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Read Instructions: + rst: The number of read instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack reads in the :ref:`address + processor `. 
+ unit: Instructions per normalization unit + Write Instructions: + rst: The number of store instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack stores counted + by the :ref:`vL1D cache-front-end `. + unit: Instructions per normalization unit + Atomic Instructions: + rst: The number of atomic instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack atomics in + the :ref:`address processor `. + unit: Instructions per normalization unit +Instruction counts: + Address Processing Unit Busy: + rst: Percent of the :ref:`total CU cycles ` the address processor + was busy + unit: Percent + Address Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending address requests further into the vL1D pipeline + unit: Percent + Data Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending write/atomic data further into the vL1D pipeline + unit: Percent + "Data-Processor \u2192 Address Stall": + rst: Percent of :ref:`total CU cycles ` the address processor was + stalled waiting to send command data to the :ref:`data processor ` + unit: Percent + Total Instructions: + rst: The total number of memory instructions executed by the address processor + over all compute units on the accelerator, per normalization unit. + unit: Instructions per normalization unit + Global/Generic Instructions: + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `.
+ unit: Instructions per normalization unit + Global/Generic Read Instructions: + rst: The total number of global & generic memory read instructions executed on all + :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Write Instructions: + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic Instructions: + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instructions: + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read Instructions: + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write Instructions: + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Atomic Instructions: + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. Typically unused as these + memory operations are typically used to implement thread-local storage. 
+ unit: Instructions per normalization unit + Spill/Stack Total Cycles: + rst: The number of cycles the address processing unit spent working on spill/stack + instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Read: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack read instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Write: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack write instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Data-Return Busy: + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was busy processing or waiting on data to return to the :doc:`CU `. + unit: Percent + "Cache RAM \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled on data to be returned from the :ref:`vL1D Cache RAM `. + unit: Percent + "Workgroup manager \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled by the :ref:`workgroup manager ` due to initialization + of registers as a part of launching new workgroups. + unit: Percent + Coalescable Instructions: + rst: The number of instructions submitted to the :ref:`data-return unit ` + by the :ref:`address processor ` that were found to be coalescable, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Read Instructions: + rst: The number of read instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack reads in the :ref:`address + processor `. 
+ unit: Instructions per normalization unit + Write Instructions: + rst: The number of store instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack stores counted + by the :ref:`vL1D cache-front-end `. + unit: Instructions per normalization unit + Atomic Instructions: + rst: The number of atomic instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack atomics in + the :ref:`address processor `. + unit: Instructions per normalization unit +Spill / stack metrics: + Address Processing Unit Busy: + rst: Percent of the :ref:`total CU cycles ` the address processor + was busy + unit: Percent + Address Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending address requests further into the vL1D pipeline + unit: Percent + Data Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending write/atomic data further into the vL1D pipeline + unit: Percent + "Data-Processor \u2192 Address Stall": + rst: Percent of :ref:`total CU cycles ` the address processor was + stalled waiting to send command data to the :ref:`data processor ` + unit: Percent + Total Instructions: + rst: The total number of memory instructions executed by the address processor + over all compute units on the accelerator, per normalization unit. + unit: Instructions per normalization unit + Global/Generic Instructions: + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `.
+ unit: Instructions per normalization unit + Global/Generic Read Instructions: + rst: The total number of global & generic memory read instructions executed on all + :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Write Instructions: + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic Instructions: + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instructions: + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read Instructions: + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write Instructions: + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Atomic Instructions: + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. Typically unused as these + memory operations are typically used to implement thread-local storage. 
+ unit: Instructions per normalization unit + Spill/Stack Total Cycles: + rst: The number of cycles the address processing unit spent working on spill/stack + instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Read: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack read instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Write: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack write instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Data-Return Busy: + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was busy processing or waiting on data to return to the :doc:`CU `. + unit: Percent + "Cache RAM \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled on data to be returned from the :ref:`vL1D Cache RAM `. + unit: Percent + "Workgroup manager \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled by the :ref:`workgroup manager ` due to initialization + of registers as a part of launching new workgroups. + unit: Percent + Coalescable Instructions: + rst: The number of instructions submitted to the :ref:`data-return unit ` + by the :ref:`address processor ` that were found to be coalescable, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Read Instructions: + rst: The number of read instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack reads in the :ref:`address + processor `. 
+ unit: Instructions per normalization unit + Write Instructions: + rst: The number of store instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack stores counted + by the :ref:`vL1D cache-front-end `. + unit: Instructions per normalization unit + Atomic Instructions: + rst: The number of atomic instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack atomics in + the :ref:`address processor `. + unit: Instructions per normalization unit +L1 Unified Translation Cache (UTCL1): + Hit rate: + rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in + vL1D cache over the total number of cache line requests to the :ref:`vL1D Cache + RAM `. + unit: Percent + Bandwidth: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions, as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so for instance, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Percent + Utilization: + rst: Indicates how busy the :ref:`vL1D Cache RAM ` was during the kernel + execution. The number of cycles where the vL1D Cache RAM is actively processing + any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_. 
+ unit: Percent + Coalescing: + rst: Indicates how well memory instructions were coalesced by the :ref:`address + processing unit `, ranging from uncoalesced (25%) to fully coalesced + (100%). Calculated as the average number of :ref:`thread-requests ` + generated per instruction divided by the ideal number of thread-requests per + instruction. + unit: Percent + Stalled on L2 Data: + rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested + data to return from the :doc:`L2 cache ` divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Stalled on L2 Req: + rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue + a request for data to the :doc:`L2 cache ` divided by the number + of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Read): + rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests + with conflicting tags being looked up concurrently, divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Write): + rst: The ratio of the number of cycles where the vL1D is stalled due to Write + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Atomic): + rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Total Req: + rst: The total number of incoming requests from the :ref:`address processing + unit ` after coalescing. 
+ unit: Requests + Read Req: + rst: The total number of incoming read requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Write Req: + rst: The total number of incoming write requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Atomic Req: + rst: The total number of incoming atomic requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Cache BW: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions per :ref:`normalization unit `. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + for instance, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + unit: Bytes per normalization unit + Cache Hit Rate: + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D Cache RAM `. + unit: Percent + Cache Accesses: + rst: The total number of cache line lookups in the vL1D. + unit: Cache lines + Cache Hits: + rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2 + cache `, that is, the number of cache line requests serviced by the + :ref:`vL1D Cache RAM ` per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Invalidations: + rst: The number of times the vL1D was issued a write-back invalidate command during + the kernel's execution per :ref:`normalization unit `. This + may be triggered by, for instance, the ``buffer_wbinvl1`` instruction. 
+ unit: Invalidations per normalization unit + L1-L2 BW: + rst: The number of bytes transferred across the vL1D-L2 interface as a result of + :ref:`VMEM ` instructions, per :ref:`normalization unit `. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for instance, + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + unit: Bytes per normalization unit + L1-L2 Read: + rst: The number of read requests for a vL1D cache line that were not satisfied by + the vL1D and must be retrieved from the :doc:`L2 Cache ` per :ref:`normalization + unit `. + unit: Requests per normalization unit + L1-L2 Write: + rst: The number of write requests to a vL1D cache line that were sent through the + vL1D to the :doc:`L2 cache `, per :ref:`normalization unit `. + unit: Requests per normalization unit + L1-L2 Atomic: + rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 + cache `, per :ref:`normalization unit `. This + includes requests for atomics with, and without return. + unit: Requests per normalization unit + L1 Access Latency: + rst: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + unit: Cycles + L1-L2 Read Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive read requests from the :doc:`L2 Cache `. This number + also includes requests for atomics with return values. + unit: Cycles + L1-L2 Write Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive acknowledgement of a write request to the :doc:`L2 Cache `. + This number also includes requests for atomics without return values.
+ unit: Cycles + NC - Read: + rst: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Read: + rst: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Read: + rst: Total read requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Read: + rst: Total read requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Write: + rst: Total write requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Write: + rst: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Write: + rst: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Write: + rst: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Atomic: + rst: Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Atomic: + rst: Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Atomic: + rst: Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Atomic: + rst: Total atomic requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit.
+ unit: Requests per normalization unit + Req: + rst: The number of translation requests made to the UTCL1 per normalization unit. + unit: Requests per normalization unit + Hit Ratio: + rst: The ratio of the number of translation requests that hit in the UTCL1 divided + by the total number of translation requests made to the UTCL1. + unit: Percent + Hits: + rst: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + unit: Requests per normalization unit + Translation Misses: + rst: The total number of translation requests that missed in the UTCL1 due to translation + not being present in the cache, per :ref:`normalization unit `. + unit: Requests per normalization unit + Permission Misses: + rst: "The total number of translation requests that missed in the UTCL1 due to\ + \ a permission error, per :ref:`normalization unit `.\ + \ This is unused and expected to be zero in most configurations for modern\ + \ CDNA\u2122 accelerators." + unit: Requests per normalization unit +vL1D cache stall metrics: + Hit rate: + rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in + vL1D cache over the total number of cache line requests to the :ref:`vL1D Cache + RAM `. + unit: Percent + Bandwidth: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions, as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so for instance, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Percent + Utilization: + rst: Indicates how busy the :ref:`vL1D Cache RAM ` was during the kernel + execution.
The number of cycles where the vL1D Cache RAM is actively processing + any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Coalescing: + rst: Indicates how well memory instructions were coalesced by the :ref:`address + processing unit `, ranging from uncoalesced (25%) to fully coalesced + (100%). Calculated as the average number of :ref:`thread-requests ` + generated per instruction divided by the ideal number of thread-requests per + instruction. + unit: Percent + Stalled on L2 Data: + rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested + data to return from the :doc:`L2 cache ` divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Stalled on L2 Req: + rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue + a request for data to the :doc:`L2 cache ` divided by the number + of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Read): + rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests + with conflicting tags being looked up concurrently, divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Write): + rst: The ratio of the number of cycles where the vL1D is stalled due to Write + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Atomic): + rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Total Req: + rst: The total number of incoming requests from the :ref:`address processing + unit ` after coalescing. 
+ unit: Requests + Read Req: + rst: The total number of incoming read requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Write Req: + rst: The total number of incoming write requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Atomic Req: + rst: The total number of incoming atomic requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Cache BW: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions per :ref:`normalization unit `. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + for instance, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + unit: Bytes per normalization unit + Cache Hit Rate: + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D Cache RAM `. + unit: Percent + Cache Accesses: + rst: The total number of cache line lookups in the vL1D. + unit: Cache lines + Cache Hits: + rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2 + cache `, that is, the number of cache line requests serviced by the + :ref:`vL1D Cache RAM ` per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Invalidations: + rst: The number of times the vL1D was issued a write-back invalidate command during + the kernel's execution per :ref:`normalization unit `. This + may be triggered by, for instance, the ``buffer_wbinvl1`` instruction. 
+ unit: Invalidations per normalization unit + L1-L2 BW: + rst: The number of bytes transferred across the vL1D-L2 interface as a result of + :ref:`VMEM ` instructions, per :ref:`normalization unit `. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for instance, + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + unit: Bytes per normalization unit + L1-L2 Read: + rst: The number of read requests for a vL1D cache line that were not satisfied by + the vL1D and must be retrieved from the :doc:`L2 Cache ` per :ref:`normalization + unit `. + unit: Requests per normalization unit + L1-L2 Write: + rst: The number of write requests to a vL1D cache line that were sent through the + vL1D to the :doc:`L2 cache `, per :ref:`normalization unit `. + unit: Requests per normalization unit + L1-L2 Atomic: + rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 + cache `, per :ref:`normalization unit `. This + includes requests for atomics with, and without return. + unit: Requests per normalization unit + L1 Access Latency: + rst: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + unit: Cycles + L1-L2 Read Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive read requests from the :doc:`L2 Cache `. This number + also includes requests for atomics with return values. + unit: Cycles + L1-L2 Write Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive acknowledgement of a write request to the :doc:`L2 Cache `. + This number also includes requests for atomics without return values.
+ unit: Cycles + NC - Read: + rst: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Read: + rst: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Read: + rst: Total read requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Read: + rst: Total read requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Write: + rst: Total write requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Write: + rst: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Write: + rst: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Write: + rst: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Atomic: + rst: Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Atomic: + rst: Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Atomic: + rst: Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Atomic: + rst: Total atomic requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit.
+ unit: Requests per normalization unit + Req: + rst: The number of translation requests made to the UTCL1 per normalization unit. + unit: Requests per normalization unit + Hit Ratio: + rst: The ratio of the number of translation requests that hit in the UTCL1 divided + by the total number of translation requests made to the UTCL1. + unit: Percent + Hits: + rst: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + unit: Requests per normalization unit + Translation Misses: + rst: The total number of translation requests that missed in the UTCL1 due to translation + not being present in the cache, per :ref:`normalization unit `. + unit: Requests per normalization unit + Permission Misses: + rst: "The total number of translation requests that missed in the UTCL1 due to\ + \ a permission error, per :ref:`normalization unit `.\ + \ This is unused and expected to be zero in most configurations for modern\ + \ CDNA\u2122 accelerators." + unit: Requests per normalization unit +vL1D cache access metrics: + Hit rate: + rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in + vL1D cache over the total number of cache line requests to the :ref:`vL1D Cache + RAM `. + unit: Percent + Bandwidth: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions, as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so for instance, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Percent + Utilization: + rst: Indicates how busy the :ref:`vL1D Cache RAM ` was during the kernel + execution.
The number of cycles where the vL1D Cache RAM is actively processing + any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Coalescing: + rst: Indicates how well memory instructions were coalesced by the :ref:`address + processing unit `, ranging from uncoalesced (25%) to fully coalesced + (100%). Calculated as the average number of :ref:`thread-requests ` + generated per instruction divided by the ideal number of thread-requests per + instruction. + unit: Percent + Stalled on L2 Data: + rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested + data to return from the :doc:`L2 cache ` divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Stalled on L2 Req: + rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue + a request for data to the :doc:`L2 cache ` divided by the number + of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Read): + rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests + with conflicting tags being looked up concurrently, divided by the number of + cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Write): + rst: The ratio of the number of cycles where the vL1D is stalled due to Write + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Atomic): + rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Total Req: + rst: The total number of incoming requests from the :ref:`address processing + unit ` after coalescing. 
+ unit: Requests + Read Req: + rst: The total number of incoming read requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Write Req: + rst: The total number of incoming write requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Atomic Req: + rst: The total number of incoming atomic requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Cache BW: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions per :ref:`normalization unit `. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + for instance, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + unit: Bytes per normalization unit + Cache Hit Rate: + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D Cache RAM `. + unit: Percent + Cache Accesses: + rst: The total number of cache line lookups in the vL1D. + unit: Cache lines + Cache Hits: + rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2 + cache `, that is, the number of cache line requests serviced by the + :ref:`vL1D Cache RAM ` per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Invalidations: + rst: The number of times the vL1D was issued a write-back invalidate command during + the kernel's execution per :ref:`normalization unit `. This + may be triggered by, for instance, the ``buffer_wbinvl1`` instruction. 
+ unit: Invalidations per normalization unit + L1-L2 BW: + rst: The number of bytes transferred across the vL1D-L2 interface as a result of + :ref:`VMEM ` instructions, per :ref:`normalization unit `. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for instance, + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + unit: Bytes per normalization unit + L1-L2 Read: + rst: The number of read requests for a vL1D cache line that were not satisfied by + the vL1D and must be retrieved from the :doc:`L2 Cache ` per :ref:`normalization + unit `. + unit: Requests per normalization unit + L1-L2 Write: + rst: The number of write requests to a vL1D cache line that were sent through the + vL1D to the :doc:`L2 cache `, per :ref:`normalization unit `. + unit: Requests per normalization unit + L1-L2 Atomic: + rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 + cache `, per :ref:`normalization unit `. This + includes requests for atomics with, and without return. + unit: Requests per normalization unit + L1 Access Latency: + rst: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + unit: Cycles + L1-L2 Read Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive read requests from the :doc:`L2 Cache `. This number + also includes requests for atomics with return values. + unit: Cycles + L1-L2 Write Latency: + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive acknowledgement of a write request to the :doc:`L2 Cache `. + This number also includes requests for atomics without return values.
+ unit: Cycles + NC - Read: + rst: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Read: + rst: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Read: + rst: Total read requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Read: + rst: Total read requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Write: + rst: Total write requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Write: + rst: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Write: + rst: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Write: + rst: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Atomic: + rst: Total atomic requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Atomic: + rst: Total atomic requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Atomic: + rst: Total atomic requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Atomic: + rst: Total atomic requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit.
+ unit: Requests per normalization unit + Req: + rst: The number of translation requests made to the UTCL1 per normalization unit. + unit: Requests per normalization unit + Hit Ratio: + rst: The ratio of the number of translation requests that hit in the UTCL1 divided + by the total number of translation requests made to the UTCL1. + unit: Percent + Hits: + rst: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + unit: Requests per normalization unit + Translation Misses: + rst: The total number of translation requests that missed in the UTCL1 due to translation + not being present in the cache, per :ref:`normalization unit `. + unit: Requests per normalization unit + Permission Misses: + rst: "The total number of translation requests that missed in the UTCL1 due to\ + \ a permission error, per :ref:`normalization unit `.\ + \ This is unused and expected to be zero in most configurations for modern\ + \ CDNA\u2122 accelerators." + unit: Requests per normalization unit +Vector L1 data-return path or Texture Data (TD): + Address Processing Unit Busy: + rst: Percent of the :ref:`total CU cycles ` the address processor + was busy + unit: Percent + Address Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending address requests further into the vL1D pipeline + unit: Percent + Data Stall: + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending write/atomic data further into the vL1D pipeline + unit: Percent + "Data-Processor \u2192 Address Stall": + rst: Percent of :ref:`total CU cycles ` the address processor was + stalled waiting to send command data to the :ref:`data processor ` + unit: Percent + Total Instructions: + rst: The total number of memory instructions executed by the address processor + over all compute units on the accelerator, per normalization unit.
+ unit: Instructions per normalization unit + Global/Generic Instructions: + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Global/Generic Read Instructions: + rst: The total number of global & generic memory read instructions executed on all + :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Write Instructions: + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic Instructions: + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instructions: + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read Instructions: + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write Instructions: + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Atomic Instructions: + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. 
Typically unused as these + memory operations are typically used to implement thread-local storage. + unit: Instructions per normalization unit + Spill/Stack Total Cycles: + rst: The number of cycles the address processing unit spent working on spill/stack + instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Read: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack read instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Write: + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack write instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Data-Return Busy: + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was busy processing or waiting on data to return to the :doc:`CU `. + unit: Percent + "Cache RAM \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled on data to be returned from the :ref:`vL1D Cache RAM `. + unit: Percent + "Workgroup manager \u2192 Data-Return Stall": + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled by the :ref:`workgroup manager ` due to initialization + of registers as a part of launching new workgroups. + unit: Percent + Coalescable Instructions: + rst: The number of instructions submitted to the :ref:`data-return unit ` + by the :ref:`address processor ` that were found to be coalescable, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Read Instructions: + rst: The number of read instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack reads in the :ref:`address + processor `. 
+ unit: Instructions per normalization unit + Write Instructions: + rst: The number of store instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack stores counted + by the :ref:`vL1D cache-front-end `. + unit: Instructions per normalization unit + Atomic Instructions: + rst: The number of atomic instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack atomics in + the :ref:`address processor `. + unit: Instructions per normalization unit +L2 Speed-of-Light: + Utilization: + rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator ` over the + :ref:`total L2 cycles `. + unit: Percent + Peak Bandwidth: + rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so e.g., if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. + unit: Percent + Hit Rate: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2-Fabric Read BW: + rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface + ` per unit time. + unit: GB/s + L2-Fabric Write and Atomic BW: + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. 
+ unit: GB/s + HBM Bandwidth: + rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory + (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + unit: GB/s + Read BW: + rst: The total number of bytes read by the L2 cache from Infinity Fabric per :ref:`normalization + unit `. + unit: Bytes per normalization unit + HBM Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to the + accelerator's local high-bandwidth memory (HBM). This breakdown does not consider + the *size* of the request (meaning that 32B and 64B requests are both counted + as a single request), so this metric only *approximates* the percent of the + L2-Fabric Read bandwidth directed to the local HBM. + unit: Percent + Remote Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Read bandwidth directed to a remote location. + unit: Percent + Uncached Read Traffic: + rst: The percent of read requests generated by the L2 cache that are reading from + an :ref:`uncached memory allocation `. Note, as described in the + :ref:`request flow ` section, a single 64B read request is + typically counted as two uncached read requests. So, it is possible for the + Uncached Read Traffic to reach up to 200% of the total number of read requests. 
+ This breakdown does not consider the *size* of the request (i.e., 32B and 64B + requests are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric read bandwidth directed to an uncached memory + location. + unit: Percent + Write and Atomic BW: + rst: The total number of bytes written by the L2 over Infinity Fabric by write and + atomic operations per :ref:`normalization unit `. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at non-write-cacheable + memory, for example, :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. + unit: Bytes per normalization unit + HBM Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM. + Note that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations. + unit: Percent + Remote Write and Atomic Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Read bandwidth directed to a remote location. 
Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained + memory ` allocations or :ref:`uncached memory ` allocations. + unit: Percent + Atomic Traffic: + rst: The percent of write requests generated by the L2 cache that are atomic requests + to *any* memory location. This breakdown does not consider the *size* of the + request (meaning that 32B and 64B requests are both counted as a single request), + so this metric only *approximates* the percent of the L2-Fabric Write and Atomic bandwidth + used by atomic requests. Note that on current CDNA accelerators, such + as the :ref:`MI2XX `, requests are only considered *atomic* by + Infinity Fabric if they are targeted at :ref:`fine-grained memory ` + allocations or :ref:`uncached memory ` allocations. + unit: Percent + Uncached Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + targeting :ref:`uncached memory allocations `. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + unit: Percent + Read Latency: + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + Write and Atomic Latency: + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + Atomic Latency: + rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric + before a completion acknowledgement (atomic without return value) or data (atomic + with return value) was returned to the L2.
+ unit: Cycles + Bandwidth: + rst: The number of bytes looked up in the L2 cache, per :ref:`normalization unit + `. The number of bytes is calculated as the number of + cache lines requested multiplied by the cache line size. This value does not + consider partial requests, so for example, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Bytes per normalization unit + Req: + rst: The total number of incoming requests to the L2 from all clients for all request + types, per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: 'The total number of read requests to the L2 from all clients. ' + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests to the L2 from all clients. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests (with and without return) to the L2 from + all clients. + unit: Requests per normalization unit + Streaming Req: + rst: The total number of incoming requests to the L2 that are marked as *streaming*. + The exact meaning of this may differ depending on the targeted accelerator, + however on an :ref:`MI2XX ` this corresponds to `non-temporal + load or stores `_. The + L2 cache attempts to evict *streaming* requests before normal requests when + the L2 is at capacity. + unit: Requests per normalization unit + Probe Req: + rst: The number of coherence probe requests made to the L2 cache from outside the + accelerator. On an :ref:`MI2XX `, probe requests may be generated + by, for example, writes to :ref:`fine-grained device ` memory + or by writes to :ref:`coarse-grained ` device memory. + unit: Requests per normalization unit + Cache Hit: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. 
+ unit: Percent + Hits: + rst: The total number of requests to the L2 from all clients that hit in the cache. + As noted in the :ref:`Speed-of-Light ` section, this includes hit-on-miss + requests. + unit: Requests per normalization unit + Misses: + rst: The total number of requests to the L2 from all clients that miss in the cache. + As noted in the :ref:`Speed-of-Light ` section, these do not include + hit-on-miss requests. + unit: Requests per normalization unit + Writeback: + rst: The total number of L2 cache lines written back to memory for any reason. Write-backs + may occur due to user code (such as HIP kernel calls to ``__threadfence_system`` + or atomic built-ins) by the :doc:`command processor `'s + memory acquire/release fences, or for other internal hardware reasons. + unit: Cache lines per normalization unit + Writeback (Internal): + rst: The total number of L2 cache lines written back to memory for internal hardware + reasons, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Writeback (vL1D Req): + rst: The total number of L2 cache lines written back to memory due to requests initiated + by the :doc:`vL1D cache `, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (Internal): + rst: The total number of L2 cache lines evicted from the cache due to capacity limits, + per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (vL1D Req): + rst: The total number of L2 cache lines evicted from the cache due to invalidation + requests initiated by the :doc:`vL1D cache `, per :ref:`normalization + unit `. + unit: Cache lines per normalization unit + NC Req: + rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per :ref:`normalization unit `. See the :ref:`memory-type` + for more information. + unit: Requests per normalization unit + UC Req: + rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations. 
+ See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + CC Req: + rst: The total number of requests to the L2 that go to Coherently Cacheable (CC) memory + allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + RW Req: + rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW) + allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + Write - Credit Starvation: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to any memory location because too many write/atomic requests were + currently in flight, as a percent of the :ref:`total active L2 cycles `. + unit: Percent + Read (32B): + rst: The total number of L2 requests to Infinity Fabric to read 32B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. Typically unused on CDNA accelerators. + unit: Requests per normalization unit + Read (64B): + rst: The total number of L2 requests to Infinity Fabric to read 64B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Read (Uncached): + rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached + data ` from any memory location, per :ref:`normalization unit + `. 64B requests for uncached data are counted as two 32B + uncached data requests. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. 
+ unit: Requests per normalization unit + Remote Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from any source other than the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (32B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B of data to any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (Uncached): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of :ref:`uncached data `, per :ref:`normalization unit + `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (64B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in any memory location other than the accelerator's local + HBM, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Atomic: + rst: The total number of L2 requests to Infinity Fabric to atomically update 32B + or 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail.
Note that on current CDNA accelerators, + such as the :ref:`MI2XX `, requests are only considered *atomic* + by Infinity Fabric if they are targeted at non-write-cacheable memory, such + as :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. + unit: Requests per normalization unit + Read Stall: + rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\ + \ on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\ + \ or CPU) over the :ref:`total active L2 cycles `." + unit: Percent + Write Stall: + rst: The ratio of the total number of cycles the L2-Fabric interface was stalled + on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles `. + unit: Percent + Read - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total + active L2 cycles `. + unit: Percent + Read - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Read - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles + `. + unit: Percent + Write - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. 
+ unit: Percent + Write - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as + a percent of the :ref:`total active L2 cycles `. + unit: Percent + Write - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to accelerator's local HBM as a percent of the total active L2 cycles. + unit: Percent +L2 cache accesses: + Utilization: + rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator ` over the + :ref:`total L2 cycles `. + unit: Percent + Peak Bandwidth: + rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so e.g., if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. + unit: Percent + Hit Rate: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2-Fabric Read BW: + rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface + ` per unit time. + unit: GB/s + L2-Fabric Write and Atomic BW: + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. + unit: GB/s + HBM Bandwidth: + rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory + (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. 
+ unit: GB/s + Read BW: + rst: The total number of bytes read by the L2 cache from Infinity Fabric per :ref:`normalization + unit `. + unit: Bytes per normalization unit + HBM Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to the + accelerator's local high-bandwidth memory (HBM). This breakdown does not consider + the *size* of the request (meaning that 32B and 64B requests are both counted + as a single request), so this metric only *approximates* the percent of the + L2-Fabric Read bandwidth directed to the local HBM. + unit: Percent + Remote Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Read bandwidth directed to a remote location. + unit: Percent + Uncached Read Traffic: + rst: The percent of read requests generated by the L2 cache that are reading from + an :ref:`uncached memory allocation `. Note, as described in the + :ref:`request flow ` section, a single 64B read request is + typically counted as two uncached read requests. So, it is possible for the + Uncached Read Traffic to reach up to 200% of the total number of read requests. + This breakdown does not consider the *size* of the request (i.e., 32B and 64B + requests are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric read bandwidth directed to an uncached memory + location. + unit: Percent + Write and Atomic BW: + rst: The total number of bytes written by the L2 over Infinity Fabric by write and + atomic operations per :ref:`normalization unit `. 
Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at non-write-cacheable + memory, for example, :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. + unit: Bytes per normalization unit + HBM Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM. + Note that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations. + unit: Percent + Remote Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained + memory ` allocations or :ref:`uncached memory ` allocations. + unit: Percent + Atomic Traffic: + rst: The percent of write requests generated by the L2 cache that are atomic requests + to *any* memory location.
This breakdown does not consider the *size* of the + request (meaning that 32B and 64B requests are both counted as a single request), + so this metric only *approximates* the percent of the L2-Fabric Write and Atomic bandwidth + used by atomic requests. Note that on current CDNA accelerators, such + as the :ref:`MI2XX `, requests are only considered *atomic* by + Infinity Fabric if they are targeted at :ref:`fine-grained memory ` + allocations or :ref:`uncached memory ` allocations. + unit: Percent + Uncached Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + targeting :ref:`uncached memory allocations `. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + unit: Percent + Read Latency: + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + Write and Atomic Latency: + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + Atomic Latency: + rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric + before a completion acknowledgement (atomic without return value) or data (atomic + with return value) was returned to the L2. + unit: Cycles + Bandwidth: + rst: The number of bytes looked up in the L2 cache, per :ref:`normalization unit + `. The number of bytes is calculated as the number of + cache lines requested multiplied by the cache line size. This value does not + consider partial requests, so for example, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line.
+ unit: Bytes per normalization unit + Req: + rst: The total number of incoming requests to the L2 from all clients for all request + types, per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: 'The total number of read requests to the L2 from all clients. ' + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests to the L2 from all clients. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests (with and without return) to the L2 from + all clients. + unit: Requests per normalization unit + Streaming Req: + rst: The total number of incoming requests to the L2 that are marked as *streaming*. + The exact meaning of this may differ depending on the targeted accelerator, + however on an :ref:`MI2XX ` this corresponds to `non-temporal + load or stores `_. The + L2 cache attempts to evict *streaming* requests before normal requests when + the L2 is at capacity. + unit: Requests per normalization unit + Probe Req: + rst: The number of coherence probe requests made to the L2 cache from outside the + accelerator. On an :ref:`MI2XX `, probe requests may be generated + by, for example, writes to :ref:`fine-grained device ` memory + or by writes to :ref:`coarse-grained ` device memory. + unit: Requests per normalization unit + Cache Hit: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + Hits: + rst: The total number of requests to the L2 from all clients that hit in the cache. + As noted in the :ref:`Speed-of-Light ` section, this includes hit-on-miss + requests. + unit: Requests per normalization unit + Misses: + rst: The total number of requests to the L2 from all clients that miss in the cache. + As noted in the :ref:`Speed-of-Light ` section, these do not include + hit-on-miss requests. 
+ unit: Requests per normalization unit + Writeback: + rst: The total number of L2 cache lines written back to memory for any reason. Write-backs + may occur due to user code (such as HIP kernel calls to ``__threadfence_system`` + or atomic built-ins) by the :doc:`command processor `'s + memory acquire/release fences, or for other internal hardware reasons. + unit: Cache lines per normalization unit + Writeback (Internal): + rst: The total number of L2 cache lines written back to memory for internal hardware + reasons, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Writeback (vL1D Req): + rst: The total number of L2 cache lines written back to memory due to requests initiated + by the :doc:`vL1D cache `, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (Internal): + rst: The total number of L2 cache lines evicted from the cache due to capacity limits, + per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (vL1D Req): + rst: The total number of L2 cache lines evicted from the cache due to invalidation + requests initiated by the :doc:`vL1D cache `, per :ref:`normalization + unit `. + unit: Cache lines per normalization unit + NC Req: + rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per :ref:`normalization unit `. See the :ref:`memory-type` + for more information. + unit: Requests per normalization unit + UC Req: + rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations. + See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + CC Req: + rst: The total number of requests to the L2 that go to Coherently Cacheable (CC) memory + allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + RW Req: + rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW) + allocations. 
See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + Write - Credit Starvation: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to any memory location because too many write/atomic requests were + currently in flight, as a percent of the :ref:`total active L2 cycles `. + unit: Percent + Read (32B): + rst: The total number of L2 requests to Infinity Fabric to read 32B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. Typically unused on CDNA accelerators. + unit: Requests per normalization unit + Read (64B): + rst: The total number of L2 requests to Infinity Fabric to read 64B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Read (Uncached): + rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached + data ` from any memory location, per :ref:`normalization unit + `. 64B requests for uncached data are counted as two 32B + uncached data requests. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from any source other than the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (32B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B of data to any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. 
+ unit: Requests per normalization unit + Write and Atomic (Uncached): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of :ref:`uncached data `, per :ref:`normalization unit + `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (64B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in any memory location other than the accelerator's local + HBM, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Atomic: + rst: The total number of L2 requests to Infinity Fabric to atomically update 32B + or 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators, + such as the :ref:`MI2XX `, requests are only considered *atomic* + by Infinity Fabric if they are targeted at non-write-cacheable memory, such + as :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX.
+ unit: Requests per normalization unit + Read Stall: + rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\ + \ on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\ + \ or CPU) over the :ref:`total active L2 cycles `." + unit: Percent + Write Stall: + rst: The ratio of the total number of cycles the L2-Fabric interface was stalled + on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles `. + unit: Percent + Read - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total + active L2 cycles `. + unit: Percent + Read - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Read - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles + `. + unit: Percent + Write - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Write - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as + a percent of the :ref:`total active L2 cycles `. 
+ unit: Percent + Write - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to accelerator's local HBM as a percent of the total active L2 cycles. + unit: Percent +L2-Fabric interface metrics: + Utilization: + rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator ` over the + :ref:`total L2 cycles `. + unit: Percent + Peak Bandwidth: + rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so e.g., if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. + unit: Percent + Hit Rate: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2-Fabric Read BW: + rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface + ` per unit time. + unit: GB/s + L2-Fabric Write and Atomic BW: + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. + unit: GB/s + HBM Bandwidth: + rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory + (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + unit: GB/s + Read BW: + rst: The total number of bytes read by the L2 cache from Infinity Fabric per :ref:`normalization + unit `. + unit: Bytes per normalization unit + HBM Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to the + accelerator's local high-bandwidth memory (HBM). 
This breakdown does not consider + the *size* of the request (meaning that 32B and 64B requests are both counted + as a single request), so this metric only *approximates* the percent of the + L2-Fabric Read bandwidth directed to the local HBM. + unit: Percent + Remote Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Read bandwidth directed to a remote location. + unit: Percent + Uncached Read Traffic: + rst: The percent of read requests generated by the L2 cache that are reading from + an :ref:`uncached memory allocation `. Note, as described in the + :ref:`request flow ` section, a single 64B read request is + typically counted as two uncached read requests. So, it is possible for the + Uncached Read Traffic to reach up to 200% of the total number of read requests. + This breakdown does not consider the *size* of the request (i.e., 32B and 64B + requests are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric read bandwidth directed to an uncached memory + location. + unit: Percent + Write and Atomic BW: + rst: The total number of bytes written by the L2 over Infinity Fabric by write and + atomic operations per :ref:`normalization unit `. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at non-write-cacheable + memory, for example, :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. 
+ unit: Bytes per normalization unit + HBM Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM. + Note that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations. + unit: Percent + Remote Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained + memory ` allocations or :ref:`uncached memory ` allocations. + unit: Percent + Atomic Traffic: + rst: The percent of write requests generated by the L2 cache that are atomic requests + to *any* memory location. This breakdown does not consider the *size* of the + request (meaning that 32B and 64B requests are both counted as a single request), + so this metric only *approximates* the percent of the L2-Fabric Write and Atomic bandwidth + used by atomic requests. 
Note that on current CDNA accelerators, such + as the :ref:`MI2XX `, requests are only considered *atomic* by + Infinity Fabric if they are targeted at :ref:`fine-grained memory ` + allocations or :ref:`uncached memory ` allocations. + unit: Percent + Uncached Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + targeting :ref:`uncached memory allocations `. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + unit: Percent + Read Latency: + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + Write and Atomic Latency: + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + Atomic Latency: + rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric + before a completion acknowledgement (atomic without return value) or data (atomic + with return value) was returned to the L2. + unit: Cycles + Bandwidth: + rst: The number of bytes looked up in the L2 cache, per :ref:`normalization unit + `. The number of bytes is calculated as the number of + cache lines requested multiplied by the cache line size. This value does not + consider partial requests, so for example, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Bytes per normalization unit + Req: + rst: The total number of incoming requests to the L2 from all clients for all request + types, per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: 'The total number of read requests to the L2 from all clients. 
' + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests to the L2 from all clients. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests (with and without return) to the L2 from + all clients. + unit: Requests per normalization unit + Streaming Req: + rst: The total number of incoming requests to the L2 that are marked as *streaming*. + The exact meaning of this may differ depending on the targeted accelerator, + however on an :ref:`MI2XX ` this corresponds to `non-temporal + load or stores `_. The + L2 cache attempts to evict *streaming* requests before normal requests when + the L2 is at capacity. + unit: Requests per normalization unit + Probe Req: + rst: The number of coherence probe requests made to the L2 cache from outside the + accelerator. On an :ref:`MI2XX `, probe requests may be generated + by, for example, writes to :ref:`fine-grained device ` memory + or by writes to :ref:`coarse-grained ` device memory. + unit: Requests per normalization unit + Cache Hit: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + Hits: + rst: The total number of requests to the L2 from all clients that hit in the cache. + As noted in the :ref:`Speed-of-Light ` section, this includes hit-on-miss + requests. + unit: Requests per normalization unit + Misses: + rst: The total number of requests to the L2 from all clients that miss in the cache. + As noted in the :ref:`Speed-of-Light ` section, these do not include + hit-on-miss requests. + unit: Requests per normalization unit + Writeback: + rst: The total number of L2 cache lines written back to memory for any reason. 
Write-backs + may occur due to user code (such as HIP kernel calls to ``__threadfence_system`` + or atomic built-ins) by the :doc:`command processor `'s + memory acquire/release fences, or for other internal hardware reasons. + unit: Cache lines per normalization unit + Writeback (Internal): + rst: The total number of L2 cache lines written back to memory for internal hardware + reasons, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Writeback (vL1D Req): + rst: The total number of L2 cache lines written back to memory due to requests initiated + by the :doc:`vL1D cache `, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (Internal): + rst: The total number of L2 cache lines evicted from the cache due to capacity limits, + per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (vL1D Req): + rst: The total number of L2 cache lines evicted from the cache due to invalidation + requests initiated by the :doc:`vL1D cache `, per :ref:`normalization + unit `. + unit: Cache lines per normalization unit + NC Req: + rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per :ref:`normalization unit `. See the :ref:`memory-type` + for more information. + unit: Requests per normalization unit + UC Req: + rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations. + See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + CC Req: + rst: The total number of requests to the L2 that go to Coherently Cacheable (CC) memory + allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + RW Req: + rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW) + allocations. See the :ref:`memory-type` for more information. 
+ unit: Requests per normalization unit + Write - Credit Starvation: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to any memory location because too many write/atomic requests were + currently in flight, as a percent of the :ref:`total active L2 cycles `. + unit: Percent + Read (32B): + rst: The total number of L2 requests to Infinity Fabric to read 32B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. Typically unused on CDNA accelerators. + unit: Requests per normalization unit + Read (64B): + rst: The total number of L2 requests to Infinity Fabric to read 64B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Read (Uncached): + rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached + data ` from any memory location, per :ref:`normalization unit + `. 64B requests for uncached data are counted as two 32B + uncached data requests. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from any source other than the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (32B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B of data to any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. 
+ unit: Requests per normalization unit + Write and Atomic (Uncached): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of :ref:`uncached data `, per :ref:`normalization unit + `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (64B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in any memory location other than the accelerator's local + HBM, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Atomic: + rst: The total number of L2 requests to Infinity Fabric to atomically update 32B + or 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators, + such as the :ref:`MI2XX `, requests are only considered *atomic* + by Infinity Fabric if they are targeted at non-write-cacheable memory, such + as :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. 
+ unit: Requests per normalization unit + Read Stall: + rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\ + \ on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\ + \ or CPU) over the :ref:`total active L2 cycles `." + unit: Percent + Write Stall: + rst: The ratio of the total number of cycles the L2-Fabric interface was stalled + on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles `. + unit: Percent + Read - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total + active L2 cycles `. + unit: Percent + Read - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Read - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles + `. + unit: Percent + Write - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Write - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as + a percent of the :ref:`total active L2 cycles `. 
+ unit: Percent + Write - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to the accelerator's local HBM as a percent of the total active L2 cycles. + unit: Percent +L2 - Fabric interface detailed metrics: + Utilization: + rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator ` over the + :ref:`total L2 cycles `. + unit: Percent + Peak Bandwidth: + rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so e.g., if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. + unit: Percent + Hit Rate: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2-Fabric Read BW: + rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface + ` per unit time. + unit: GB/s + L2-Fabric Write and Atomic BW: + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. + unit: GB/s + HBM Bandwidth: + rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory + (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + unit: GB/s + Read BW: + rst: The total number of bytes read by the L2 cache from Infinity Fabric per :ref:`normalization + unit `. + unit: Bytes per normalization unit + HBM Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to the + accelerator's local high-bandwidth memory (HBM). 
This breakdown does not consider + the *size* of the request (meaning that 32B and 64B requests are both counted + as a single request), so this metric only *approximates* the percent of the + L2-Fabric Read bandwidth directed to the local HBM. + unit: Percent + Remote Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Read bandwidth directed to a remote location. + unit: Percent + Uncached Read Traffic: + rst: The percent of read requests generated by the L2 cache that are reading from + an :ref:`uncached memory allocation `. Note, as described in the + :ref:`request flow ` section, a single 64B read request is + typically counted as two uncached read requests. So, it is possible for the + Uncached Read Traffic to reach up to 200% of the total number of read requests. + This breakdown does not consider the *size* of the request (i.e., 32B and 64B + requests are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric read bandwidth directed to an uncached memory + location. + unit: Percent + Write and Atomic BW: + rst: The total number of bytes written by the L2 over Infinity Fabric by write and + atomic operations per :ref:`normalization unit `. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at non-write-cacheable + memory, for example, :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. 
+ unit: Bytes per normalization unit + HBM Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM. + Note that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations. + unit: Percent + Remote Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained + memory ` allocations or :ref:`uncached memory ` allocations. + unit: Percent + Atomic Traffic: + rst: The percent of write requests generated by the L2 cache that are atomic requests + to *any* memory location. This breakdown does not consider the *size* of the + request (meaning that 32B and 64B requests are both counted as a single request), + so this metric only *approximates* the percent of the L2-Fabric Write and Atomic bandwidth + used by atomic requests. 
Note that on current CDNA accelerators, such + as the :ref:`MI2XX `, requests are only considered *atomic* by + Infinity Fabric if they are targeted at :ref:`fine-grained memory ` + allocations or :ref:`uncached memory ` allocations. + unit: Percent + Uncached Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + targeting :ref:`uncached memory allocations `. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + unit: Percent + Read Latency: + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + Write and Atomic Latency: + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + Atomic Latency: + rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric + before a completion acknowledgement (atomic without return value) or data (atomic + with return value) was returned to the L2. + unit: Cycles + Bandwidth: + rst: The number of bytes looked up in the L2 cache, per :ref:`normalization unit + `. The number of bytes is calculated as the number of + cache lines requested multiplied by the cache line size. This value does not + consider partial requests, so for example, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Bytes per normalization unit + Req: + rst: The total number of incoming requests to the L2 from all clients for all request + types, per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: 'The total number of read requests to the L2 from all clients. 
' + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests to the L2 from all clients. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests (with and without return) to the L2 from + all clients. + unit: Requests per normalization unit + Streaming Req: + rst: The total number of incoming requests to the L2 that are marked as *streaming*. + The exact meaning of this may differ depending on the targeted accelerator, + however on an :ref:`MI2XX ` this corresponds to `non-temporal + load or stores `_. The + L2 cache attempts to evict *streaming* requests before normal requests when + the L2 is at capacity. + unit: Requests per normalization unit + Probe Req: + rst: The number of coherence probe requests made to the L2 cache from outside the + accelerator. On an :ref:`MI2XX `, probe requests may be generated + by, for example, writes to :ref:`fine-grained device ` memory + or by writes to :ref:`coarse-grained ` device memory. + unit: Requests per normalization unit + Cache Hit: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + Hits: + rst: The total number of requests to the L2 from all clients that hit in the cache. + As noted in the :ref:`Speed-of-Light ` section, this includes hit-on-miss + requests. + unit: Requests per normalization unit + Misses: + rst: The total number of requests to the L2 from all clients that miss in the cache. + As noted in the :ref:`Speed-of-Light ` section, these do not include + hit-on-miss requests. + unit: Requests per normalization unit + Writeback: + rst: The total number of L2 cache lines written back to memory for any reason. 
Write-backs + may occur due to user code (such as HIP kernel calls to ``__threadfence_system`` + or atomic built-ins) by the :doc:`command processor `'s + memory acquire/release fences, or for other internal hardware reasons. + unit: Cache lines per normalization unit + Writeback (Internal): + rst: The total number of L2 cache lines written back to memory for internal hardware + reasons, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Writeback (vL1D Req): + rst: The total number of L2 cache lines written back to memory due to requests initiated + by the :doc:`vL1D cache `, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (Internal): + rst: The total number of L2 cache lines evicted from the cache due to capacity limits, + per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (vL1D Req): + rst: The total number of L2 cache lines evicted from the cache due to invalidation + requests initiated by the :doc:`vL1D cache `, per :ref:`normalization + unit `. + unit: Cache lines per normalization unit + NC Req: + rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per :ref:`normalization unit `. See the :ref:`memory-type` + for more information. + unit: Requests per normalization unit + UC Req: + rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations. + See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + CC Req: + rst: The total number of requests to the L2 that go to Coherently Cacheable (CC) memory + allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + RW Req: + rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW) + allocations. See the :ref:`memory-type` for more information. 
+ unit: Requests per normalization unit + Write - Credit Starvation: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to any memory location because too many write/atomic requests were + currently in flight, as a percent of the :ref:`total active L2 cycles `. + unit: Percent + Read (32B): + rst: The total number of L2 requests to Infinity Fabric to read 32B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. Typically unused on CDNA accelerators. + unit: Requests per normalization unit + Read (64B): + rst: The total number of L2 requests to Infinity Fabric to read 64B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Read (Uncached): + rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached + data ` from any memory location, per :ref:`normalization unit + `. 64B requests for uncached data are counted as two 32B + uncached data requests. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from any source other than the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (32B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B of data to any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. 
+ unit: Requests per normalization unit + Write and Atomic (Uncached): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of :ref:`uncached data `, per :ref:`normalization unit + `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (64B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in any memory location other than the accelerator's local + HBM, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Atomic: + rst: The total number of L2 requests to Infinity Fabric to atomically update 32B + or 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators, + such as the :ref:`MI2XX `, requests are only considered *atomic* + by Infinity Fabric if they are targeted at non-write-cacheable memory, such + as :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. 
+ unit: Requests per normalization unit + Read Stall: + rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\ + \ on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\ + \ or CPU) over the :ref:`total active L2 cycles `." + unit: Percent + Write Stall: + rst: The ratio of the total number of cycles the L2-Fabric interface was stalled + on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles `. + unit: Percent + Read - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total + active L2 cycles `. + unit: Percent + Read - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Read - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles + `. + unit: Percent + Write - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Write - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as + a percent of the :ref:`total active L2 cycles `. 
+ unit: Percent + Write - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to the accelerator's local HBM as a percent of the total active L2 cycles. + unit: Percent +L2 - Fabric Interface stalls: + Utilization: + rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator ` over the + :ref:`total L2 cycles `. + unit: Percent + Peak Bandwidth: + rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so e.g., if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. + unit: Percent + Hit Rate: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2-Fabric Read BW: + rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface + ` per unit time. + unit: GB/s + L2-Fabric Write and Atomic BW: + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. + unit: GB/s + HBM Bandwidth: + rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory + (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + unit: GB/s + Read BW: + rst: The total number of bytes read by the L2 cache from Infinity Fabric per :ref:`normalization + unit `. + unit: Bytes per normalization unit + HBM Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to the + accelerator's local high-bandwidth memory (HBM). 
This breakdown does not consider + the *size* of the request (meaning that 32B and 64B requests are both counted + as a single request), so this metric only *approximates* the percent of the + L2-Fabric Read bandwidth directed to the local HBM. + unit: Percent + Remote Read Traffic: + rst: The percent of read requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Read bandwidth directed to a remote location. + unit: Percent + Uncached Read Traffic: + rst: The percent of read requests generated by the L2 cache that are reading from + an :ref:`uncached memory allocation `. Note, as described in the + :ref:`request flow ` section, a single 64B read request is + typically counted as two uncached read requests. So, it is possible for the + Uncached Read Traffic to reach up to 200% of the total number of read requests. + This breakdown does not consider the *size* of the request (i.e., 32B and 64B + requests are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric read bandwidth directed to an uncached memory + location. + unit: Percent + Write and Atomic BW: + rst: The total number of bytes written by the L2 over Infinity Fabric by write and + atomic operations per :ref:`normalization unit `. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at non-write-cacheable + memory, for example, :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. 
+ unit: Bytes per normalization unit + HBM Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM. + Note that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations. + unit: Percent + Remote Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are routed to any + memory location other than the accelerator's local high-bandwidth memory (HBM) + -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, requests + are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained + memory ` allocations or :ref:`uncached memory ` allocations. + unit: Percent + Atomic Traffic: + rst: The percent of write requests generated by the L2 cache that are atomic requests + to *any* memory location. This breakdown does not consider the *size* of the + request (meaning that 32B and 64B requests are both counted as a single request), + so this metric only *approximates* the percent of the L2-Fabric Write and Atomic bandwidth + that is due to atomic requests.
Note that on current CDNA accelerators, such + as the :ref:`MI2XX `, requests are only considered *atomic* by + Infinity Fabric if they are targeted at :ref:`fine-grained memory ` + allocations or :ref:`uncached memory ` allocations. + unit: Percent + Uncached Write and Atomic Traffic: + rst: The percent of write and atomic requests generated by the L2 cache that are + targeting :ref:`uncached memory allocations `. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + unit: Percent + Read Latency: + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + Write and Atomic Latency: + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + Atomic Latency: + rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric + before a completion acknowledgement (atomic without return value) or data (atomic + with return value) was returned to the L2. + unit: Cycles + Bandwidth: + rst: The number of bytes looked up in the L2 cache, per :ref:`normalization unit + `. The number of bytes is calculated as the number of + cache lines requested multiplied by the cache line size. This value does not + consider partial requests, so for example, if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + unit: Bytes per normalization unit + Req: + rst: The total number of incoming requests to the L2 from all clients for all request + types, per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: 'The total number of read requests to the L2 from all clients.
' + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests to the L2 from all clients. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests (with and without return) to the L2 from + all clients. + unit: Requests per normalization unit + Streaming Req: + rst: The total number of incoming requests to the L2 that are marked as *streaming*. + The exact meaning of this may differ depending on the targeted accelerator, + however on an :ref:`MI2XX ` this corresponds to `non-temporal + load or stores `_. The + L2 cache attempts to evict *streaming* requests before normal requests when + the L2 is at capacity. + unit: Requests per normalization unit + Probe Req: + rst: The number of coherence probe requests made to the L2 cache from outside the + accelerator. On an :ref:`MI2XX `, probe requests may be generated + by, for example, writes to :ref:`fine-grained device ` memory + or by writes to :ref:`coarse-grained ` device memory. + unit: Requests per normalization unit + Cache Hit: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + Hits: + rst: The total number of requests to the L2 from all clients that hit in the cache. + As noted in the :ref:`Speed-of-Light ` section, this includes hit-on-miss + requests. + unit: Requests per normalization unit + Misses: + rst: The total number of requests to the L2 from all clients that miss in the cache. + As noted in the :ref:`Speed-of-Light ` section, these do not include + hit-on-miss requests. + unit: Requests per normalization unit + Writeback: + rst: The total number of L2 cache lines written back to memory for any reason. 
Write-backs + may occur due to user code (such as HIP kernel calls to ``__threadfence_system`` + or atomic built-ins), by the :doc:`command processor `'s + memory acquire/release fences, or for other internal hardware reasons. + unit: Cache lines per normalization unit + Writeback (Internal): + rst: The total number of L2 cache lines written back to memory for internal hardware + reasons, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Writeback (vL1D Req): + rst: The total number of L2 cache lines written back to memory due to requests initiated + by the :doc:`vL1D cache `, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (Internal): + rst: The total number of L2 cache lines evicted from the cache due to capacity limits, + per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (vL1D Req): + rst: The total number of L2 cache lines evicted from the cache due to invalidation + requests initiated by the :doc:`vL1D cache `, per :ref:`normalization + unit `. + unit: Cache lines per normalization unit + NC Req: + rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per :ref:`normalization unit `. See the :ref:`memory-type` + for more information. + unit: Requests per normalization unit + UC Req: + rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations. + See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + CC Req: + rst: The total number of requests to the L2 that go to Coherently Cacheable (CC) memory + allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + RW Req: + rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW) + allocations. See the :ref:`memory-type` for more information.
+ unit: Requests per normalization unit + Write - Credit Starvation: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to any memory location because too many write/atomic requests were + currently in flight, as a percent of the :ref:`total active L2 cycles `. + unit: Percent + Read (32B): + rst: The total number of L2 requests to Infinity Fabric to read 32B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. Typically unused on CDNA accelerators. + unit: Requests per normalization unit + Read (64B): + rst: The total number of L2 requests to Infinity Fabric to read 64B of data from + any memory location, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Read (Uncached): + rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached + data ` from any memory location, per :ref:`normalization unit + `. 64B requests for uncached data are counted as two 32B + uncached data requests. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Read: + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from any source other than the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (32B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B of data to any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. 
+ unit: Requests per normalization unit + Write and Atomic (Uncached): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of :ref:`uncached data `, per :ref:`normalization unit + `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (64B): + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Write and Atomic: + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in any memory location other than the accelerator's local + HBM, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Atomic: + rst: The total number of L2 requests to Infinity Fabric to atomically update 32B + or 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators, + such as the :ref:`MI2XX `, requests are only considered *atomic* + by Infinity Fabric if they are targeted at non-write-cacheable memory, such + as :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX.
+ unit: Requests per normalization unit + Read Stall: + rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\ + \ on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\ + \ or CPU) over the :ref:`total active L2 cycles `." + unit: Percent + Write Stall: + rst: The ratio of the total number of cycles the L2-Fabric interface was stalled + on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles `. + unit: Percent + Read - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total + active L2 cycles `. + unit: Percent + Read - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Read - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles + `. + unit: Percent + Write - PCIe Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Write - Infinity Fabric Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as + a percent of the :ref:`total active L2 cycles `. 
+ unit: Percent + Write - HBM Stall: + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to accelerator's local HBM as a percent of the total active L2 cycles. + unit: Percent +Scalar L1D Speed-of-Light: + Bandwidth: + rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total sL1D + cycles `. + unit: Percent + Cache Hit Rate: + rst: Indicates the percent of sL1D requests that hit on a previously loaded line + the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_ + over the number of all sL1D requests. + unit: Percent + sL1D-L2 BW: + rst: "The total number of bytes read from, written to, or atomically updated \ + \ across the sL1D\u2194:doc:`L2 ` interface, per :ref:`normalization\ + \ unit `. Note that sL1D writes and atomics are typically\ + \ unused on current CDNA accelerators, so in the majority of cases this can\ + \ be interpreted as an sL1D\u2192L2 read bandwidth." + unit: Bytes per normalization unit + Req: + rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization + unit `. + unit: Requests per normalization unit + Hits: + rst: The total number of sL1D requests that hit on a previously loaded cache line, + per :ref:`normalization unit `. + unit: Requests per normalization unit + Misses - Non Duplicated: + rst: The total number of sL1D requests that missed on a cache line that *was not* + already pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. + unit: Requests per normalization unit + Misses- Duplicated: + rst: The total number of sL1D requests that missed on a cache line that *was* already + pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. 
+ unit: Requests per normalization unit + Read Req (Total): + rst: The total number of sL1D read requests of any size, per :ref:`normalization + unit `. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests from sL1D to the :doc:`L2 `, + per :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + Read Req (1 DWord): + rst: The total number of sL1D read requests made for a single dword of data (4B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (2 DWord): + rst: The total number of sL1D read requests made for a two dwords of data (8B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (4 DWord): + rst: The total number of sL1D read requests made for a four dwords of data (16B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (8 DWord): + rst: The total number of sL1D read requests made for a eight dwords of data (32B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (16 DWord): + rst: The total number of sL1D read requests made for a sixteen dwords of data (64B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: The total number of read requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + Stall Cycles: + rst: "The total number of cycles the sL1D\u2194 :doc:`L2 ` interface\ + \ was stalled, per :ref:`normalization unit `." + unit: Cycles per normalization unit +Scalar L1D cache accesses: + Bandwidth: + rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. 
Calculated as the ratio of sL1D requests over the :ref:`total sL1D + cycles `. + unit: Percent + Cache Hit Rate: + rst: Indicates the percent of sL1D requests that hit on a previously loaded line + the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_ + over the number of all sL1D requests. + unit: Percent + sL1D-L2 BW: + rst: "The total number of bytes read from, written to, or atomically updated \ + \ across the sL1D\u2194:doc:`L2 ` interface, per :ref:`normalization\ + \ unit `. Note that sL1D writes and atomics are typically\ + \ unused on current CDNA accelerators, so in the majority of cases this can\ + \ be interpreted as an sL1D\u2192L2 read bandwidth." + unit: Bytes per normalization unit + Req: + rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization + unit `. + unit: Requests per normalization unit + Hits: + rst: The total number of sL1D requests that hit on a previously loaded cache line, + per :ref:`normalization unit `. + unit: Requests per normalization unit + Misses - Non Duplicated: + rst: The total number of sL1D requests that missed on a cache line that *was not* + already pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. + unit: Requests per normalization unit + Misses- Duplicated: + rst: The total number of sL1D requests that missed on a cache line that *was* already + pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. + unit: Requests per normalization unit + Read Req (Total): + rst: The total number of sL1D read requests of any size, per :ref:`normalization + unit `. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests from sL1D to the :doc:`L2 `, + per :ref:`normalization unit `. Typically unused on current + CDNA accelerators. 
+ unit: Requests per normalization unit + Read Req (1 DWord): + rst: The total number of sL1D read requests made for a single dword of data (4B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (2 DWord): + rst: The total number of sL1D read requests made for a two dwords of data (8B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (4 DWord): + rst: The total number of sL1D read requests made for a four dwords of data (16B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (8 DWord): + rst: The total number of sL1D read requests made for a eight dwords of data (32B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (16 DWord): + rst: The total number of sL1D read requests made for a sixteen dwords of data (64B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: The total number of read requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + Stall Cycles: + rst: "The total number of cycles the sL1D\u2194 :doc:`L2 ` interface\ + \ was stalled, per :ref:`normalization unit `." + unit: Cycles per normalization unit +Scalar L1D Cache - L2 Interface: + Bandwidth: + rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total sL1D + cycles `. + unit: Percent + Cache Hit Rate: + rst: Indicates the percent of sL1D requests that hit on a previously loaded line + the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_ + over the number of all sL1D requests. 
+ unit: Percent + sL1D-L2 BW: + rst: "The total number of bytes read from, written to, or atomically updated \ + \ across the sL1D\u2194:doc:`L2 ` interface, per :ref:`normalization\ + \ unit `. Note that sL1D writes and atomics are typically\ + \ unused on current CDNA accelerators, so in the majority of cases this can\ + \ be interpreted as an sL1D\u2192L2 read bandwidth." + unit: Bytes per normalization unit + Req: + rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization + unit `. + unit: Requests per normalization unit + Hits: + rst: The total number of sL1D requests that hit on a previously loaded cache line, + per :ref:`normalization unit `. + unit: Requests per normalization unit + Misses - Non Duplicated: + rst: The total number of sL1D requests that missed on a cache line that *was not* + already pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. + unit: Requests per normalization unit + Misses- Duplicated: + rst: The total number of sL1D requests that missed on a cache line that *was* already + pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. + unit: Requests per normalization unit + Read Req (Total): + rst: The total number of sL1D read requests of any size, per :ref:`normalization + unit `. + unit: Requests per normalization unit + Atomic Req: + rst: The total number of atomic requests from sL1D to the :doc:`L2 `, + per :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + Read Req (1 DWord): + rst: The total number of sL1D read requests made for a single dword of data (4B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (2 DWord): + rst: The total number of sL1D read requests made for a two dwords of data (8B), + per :ref:`normalization unit `. 
+ unit: Requests per normalization unit + Read Req (4 DWord): + rst: The total number of sL1D read requests made for a four dwords of data (16B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (8 DWord): + rst: The total number of sL1D read requests made for a eight dwords of data (32B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (16 DWord): + rst: The total number of sL1D read requests made for a sixteen dwords of data (64B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + rst: The total number of read requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. + unit: Requests per normalization unit + Write Req: + rst: The total number of write requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + Stall Cycles: + rst: "The total number of cycles the sL1D\u2194 :doc:`L2 ` interface\ + \ was stalled, per :ref:`normalization unit `." + unit: Cycles per normalization unit +L1I Speed-of-Light: + Bandwidth: + rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I + cycles `. + unit: Percent + Cache Hit Rate: + rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line + the cache. Calculated as the ratio of the number of L1I requests that hit over + the number of all L1I requests. + unit: Percent + L1I-L2 Bandwidth: + rst: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\ + \ achieved. Calculated as the ratio of the total number of requests from the\ + \ L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `." 
+ unit: Percent + Req: + rst: The total number of requests made to the L1I per normalization-unit + unit: Requests per normalization unit + Hits: + rst: The total number of L1I requests that hit on a previously loaded cache line, + per :ref:`normalization-unit `. + unit: Requests per normalization unit + Misses - Non Duplicated: + rst: The total number of L1I requests that missed on a cache line that *were + not* already pending due to another request, per :ref:`normalization-unit `. + See note in :ref:`desc-l1i-sol` for more detail. + unit: Requests per normalization unit + Misses - Duplicated: + rst: The total number of L1I requests that missed on a cache line that *were* already + pending due to another request, per :ref:`normalization-unit `. + See note in :ref:`desc-l1i-sol` for more detail. + unit: Requests per normalization unit + Instruction Fetch Latency: + rst: The average number of cycles spent to fetch instructions to a :doc:`CU `. + unit: Cycles +L1I cache accesses: + Bandwidth: + rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I + cycles `. + unit: Percent + Cache Hit Rate: + rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line + the cache. Calculated as the ratio of the number of L1I requests that hit over + the number of all L1I requests. + unit: Percent + L1I-L2 Bandwidth: + rst: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\ + \ achieved. Calculated as the ratio of the total number of requests from the\ + \ L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `." + unit: Percent + Req: + rst: The total number of requests made to the L1I per normalization-unit + unit: Requests per normalization unit + Hits: + rst: The total number of L1I requests that hit on a previously loaded cache line, + per :ref:`normalization-unit `. 
+ unit: Requests per normalization unit + Misses - Non Duplicated: + rst: The total number of L1I requests that missed on a cache line that *were + not* already pending due to another request, per :ref:`normalization-unit `. + See note in :ref:`desc-l1i-sol` for more detail. + unit: Requests per normalization unit + Misses - Duplicated: + rst: The total number of L1I requests that missed on a cache line that *were* already + pending due to another request, per :ref:`normalization-unit `. + See note in :ref:`desc-l1i-sol` for more detail. + unit: Requests per normalization unit + Instruction Fetch Latency: + rst: The average number of cycles spent to fetch instructions to a :doc:`CU `. + unit: Cycles +L1I <-> L2 interface: + Bandwidth: + rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I + cycles `. + unit: Percent + Cache Hit Rate: + rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line + the cache. Calculated as the ratio of the number of L1I requests that hit over + the number of all L1I requests. + unit: Percent + L1I-L2 Bandwidth: + rst: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\ + \ achieved. Calculated as the ratio of the total number of requests from the\ + \ L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `." + unit: Percent + Req: + rst: The total number of requests made to the L1I per normalization-unit + unit: Requests per normalization unit + Hits: + rst: The total number of L1I requests that hit on a previously loaded cache line, + per :ref:`normalization-unit `. + unit: Requests per normalization unit + Misses - Non Duplicated: + rst: The total number of L1I requests that missed on a cache line that *were + not* already pending due to another request, per :ref:`normalization-unit `. + See note in :ref:`desc-l1i-sol` for more detail. 
+ unit: Requests per normalization unit + Misses - Duplicated: + rst: The total number of L1I requests that missed on a cache line that *were* already + pending due to another request, per :ref:`normalization-unit `. + See note in :ref:`desc-l1i-sol` for more detail. + unit: Requests per normalization unit + Instruction Fetch Latency: + rst: The average number of cycles spent to fetch instructions to a :doc:`CU `. + unit: Cycles +Workgroup manager utilizations: + Accelerator Utilization: + rst: The percent of cycles in the kernel where the accelerator was actively doing + any work. + unit: Percent + Scheduler-Pipe Utilization: + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where the scheduler-pipes were actively doing any work. Note: this value + is expected to range between 0% and 25%. See :ref:`desc-spi`.' + unit: Percent + Workgroup Manager Utilization: + rst: The percent of cycles in the kernel where the workgroup manager was actively + doing any work. + unit: Percent + Shader Engine Utilization: + rst: The percent of :ref:`total shader engine cycles ` in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. + unit: Percent + SIMD Utilization: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + any :ref:`SIMD ` on a CU was actively doing any work, summed over + all CUs. Low values (less than 100%) indicate that the accelerator was not + fully saturated by the kernel, or a potential load-imbalance issue. + unit: Percent + Dispatched Workgroups: + rst: The total number of workgroups forming this kernel launch. + unit: Workgroups + Dispatched Wavefronts: + rst: The total number of wavefronts, summed over all workgroups, forming this + kernel launch. 
+ unit: Wavefronts + VGPR Writes: + rst: The average number of cycles spent initializing :ref:`VGPRs ` at + wave creation. + unit: Cycles/wave + SGPR Writes: + rst: The average number of cycles spent initializing :ref:`SGPRs ` at + wave creation. + unit: Cycles/wave + Not-scheduled Rate (Workgroup Manager): + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to a bottleneck within the workgroup manager rather than a lack of a CU + or :ref:`SIMD ` with sufficient resources. Note: this value is expected + to range between 0-25%. See note in :ref:`workgroup manager ` description.' + unit: Percent + Not-scheduled Rate (Scheduler-Pipe): + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to a bottleneck within the scheduler-pipes rather than a lack of a CU or + :ref:`SIMD ` with sufficient resources. Note: this value is expected + to range between 0-25%, see note in :ref:`workgroup manager ` description.' + unit: Percent + Scheduler-Pipe Stall Rate: + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` + with sufficient resources). Note: this value is expected to range between 0-25%, + see note in :ref:`workgroup manager ` description.' + unit: Percent + Scratch Stall Rate: + rst: The percent of :ref:`total shader-engine cycles ` in the kernel + where a workgroup could not be scheduled to a :doc:`CU ` due + to lack of :ref:`private (a.k.a., scratch) memory ` slots. While + this can reach up to 100%, note that the actual occupancy limitations on a kernel + using private memory are typically quite small (for example, less than 1% of + the total number of waves that can be scheduled to an accelerator). 
+ unit: Percent + Insufficient SIMD Waveslots: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`waveslots `. + unit: Percent + Insufficient SIMD VGPRs: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`VGPRs `. + unit: Percent + Insufficient SIMD SGPRs: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`SGPRs `. + unit: Percent + Insufficient CU LDS: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to lack + of available :doc:`LDS `. + unit: Percent + Insufficient CU Barriers: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to lack + of available :ref:`barriers `. + unit: Percent + Reached CU Workgroup Limit: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to limits + within the workgroup manager. This is expected to always be zero on CDNA2 + or newer accelerators (and small for previous accelerators). + unit: Percent + Reached CU Wavefront Limit: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a wavefront could not be scheduled to a :doc:`CU ` due to limits + within the workgroup manager. This is expected to always be zero on CDNA2 + or newer accelerators (and small for previous accelerators). + unit: Percent +Workgroup Manager - Resource Allocation: + Accelerator Utilization: + rst: The percent of cycles in the kernel where the accelerator was actively doing + any work.
+ unit: Percent + Scheduler-Pipe Utilization: + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where the scheduler-pipes were actively doing any work. Note: this value + is expected to range between 0% and 25%. See :ref:`desc-spi`.' + unit: Percent + Workgroup Manager Utilization: + rst: The percent of cycles in the kernel where the workgroup manager was actively + doing any work. + unit: Percent + Shader Engine Utilization: + rst: The percent of :ref:`total shader engine cycles ` in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. + unit: Percent + SIMD Utilization: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + any :ref:`SIMD ` on a CU was actively doing any work, summed over + all CUs. Low values (less than 100%) indicate that the accelerator was not + fully saturated by the kernel, or a potential load-imbalance issue. + unit: Percent + Dispatched Workgroups: + rst: The total number of workgroups forming this kernel launch. + unit: Workgroups + Dispatched Wavefronts: + rst: The total number of wavefronts, summed over all workgroups, forming this + kernel launch. + unit: Wavefronts + VGPR Writes: + rst: The average number of cycles spent initializing :ref:`VGPRs ` at + wave creation. + unit: Cycles/wave + SGPR Writes: + rst: The average number of cycles spent initializing :ref:`SGPRs ` at + wave creation. + unit: Cycles/wave + Not-scheduled Rate (Workgroup Manager): + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to a bottleneck within the workgroup manager rather than a lack of a CU + or :ref:`SIMD ` with sufficient resources. Note: this value is expected + to range between 0-25%. See note in :ref:`workgroup manager ` description.' 
+ unit: Percent + Not-scheduled Rate (Scheduler-Pipe): + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to a bottleneck within the scheduler-pipes rather than a lack of a CU or + :ref:`SIMD ` with sufficient resources. Note: this value is expected + to range between 0-25%, see note in :ref:`workgroup manager ` description.' + unit: Percent + Scheduler-Pipe Stall Rate: + rst: 'The percent of :ref:`total scheduler-pipe cycles ` in the + kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` + with sufficient resources). Note: this value is expected to range between 0-25%, + see note in :ref:`workgroup manager ` description.' + unit: Percent + Scratch Stall Rate: + rst: The percent of :ref:`total shader-engine cycles ` in the kernel + where a workgroup could not be scheduled to a :doc:`CU ` due + to lack of :ref:`private (a.k.a., scratch) memory ` slots. While + this can reach up to 100%, note that the actual occupancy limitations on a kernel + using private memory are typically quite small (for example, less than 1% of + the total number of waves that can be scheduled to an accelerator). + unit: Percent + Insufficient SIMD Waveslots: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`waveslots `. + unit: Percent + Insufficient SIMD VGPRs: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`VGPRs `. + unit: Percent + Insufficient SIMD SGPRs: + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`SGPRs `. 
+ unit: Percent + Insufficient CU LDS: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to lack + of available :doc:`LDS `. + unit: Percent + Insufficient CU Barriers: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to lack + of available :ref:`barriers `. + unit: Percent + Reached CU Workgroup Limit: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to limits + within the workgroup manager. This is expected to always be zero on CDNA2 + or newer accelerators (and small for previous accelerators). + unit: Percent + Reached CU Wavefront Limit: + rst: The percent of :ref:`total CU cycles ` in the kernel where + a wavefront could not be scheduled to a :doc:`CU ` due to limits + within the workgroup manager. This is expected to always be zero on CDNA2 + or newer accelerators (and small for previous accelerators). + unit: Percent +Command processor fetcher (CPF): + CPF Utilization: + rst: Percent of total cycles where the CPF was busy actively doing any work. The + ratio of CPF busy cycles over total cycles counted by the CPF. + unit: Percent + CPF Stall: + rst: Percent of CPF busy cycles where the CPF was stalled for any reason. + unit: Percent + CPF-L2 Utilization: + rst: Percent of total cycles counted by the CPF-:doc:`L2 ` interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + unit: Percent + CPF-L2 Stall: + rst: Percent of CPF-:doc:`L2 ` busy cycles where the CPF-L2 interface + was stalled for any reason. + unit: Percent + CPF-UTCL1 Stall: + rst: Percent of CPF busy cycles where the CPF was stalled by address translation. + unit: Percent + CPC Utilization: + rst: Percent of total cycles where the CPC was busy actively doing any work.
The + ratio of CPC busy cycles over total cycles counted by the CPC. + unit: Percent + CPC Stall Rate: + rst: Percent of CPC busy cycles where the CPC was stalled for any reason. + unit: Percent + CPC Packet Decoding Utilization: + rst: Percent of CPC busy cycles spent decoding commands for processing. + unit: Percent + CPC-Workgroup Manager Utilization: + rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup + manager `. + unit: Percent + CPC-L2 Utilization: + rst: Percent of total cycles counted by the CPC-:doc:`L2 ` interface where + the CPC-L2 interface was active doing any work. + unit: Percent + CPC-UTCL1 Stall: + rst: Percent of CPC busy cycles where the CPC was stalled by address translation. + unit: Percent + CPC-UTCL2 Utilization: + rst: Percent of total cycles counted by the CPC's :doc:`L2 ` address translation + interface where the CPC was busy doing address translation work. + unit: Percent +Command processor packet processor (CPC): + CPF Utilization: + rst: Percent of total cycles where the CPF was busy actively doing any work. The + ratio of CPF busy cycles over total cycles counted by the CPF. + unit: Percent + CPF Stall: + rst: Percent of CPF busy cycles where the CPF was stalled for any reason. + unit: Percent + CPF-L2 Utilization: + rst: Percent of total cycles counted by the CPF-:doc:`L2 ` interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + unit: Percent + CPF-L2 Stall: + rst: Percent of CPF-:doc:`L2 ` busy cycles where the CPF-L2 interface + was stalled for any reason. + unit: Percent + CPF-UTCL1 Stall: + rst: Percent of CPF busy cycles where the CPF was stalled by address translation. + unit: Percent + CPC Utilization: + rst: Percent of total cycles where the CPC was busy actively doing any work. The + ratio of CPC busy cycles over total cycles counted by the CPC.
+ unit: Percent + CPC Stall Rate: + rst: Percent of CPC busy cycles where the CPC was stalled for any reason. + unit: Percent + CPC Packet Decoding Utilization: + rst: Percent of CPC busy cycles spent decoding commands for processing. + unit: Percent + CPC-Workgroup Manager Utilization: + rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup + manager `. + unit: Percent + CPC-L2 Utilization: + rst: Percent of total cycles counted by the CPC-:doc:`L2 ` interface where + the CPC-L2 interface was active doing any work. + unit: Percent + CPC-UTCL1 Stall: + rst: Percent of CPC busy cycles where the CPC was stalled by address translation. + unit: Percent + CPC-UTCL2 Utilization: + rst: Percent of total cycles counted by the CPC's :doc:`L2 ` address translation + interface where the CPC was busy doing address translation work. + unit: Percent +System Speed-of-Light: + VALU FLOPs: + rst: 'The total floating-point operations executed per second on the :ref:`VALU + `. This is also presented as a percent of the peak theoretical FLOPs + achievable on the specific accelerator. Note: this does not include any floating-point + operations from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU IOPs: + rst: 'The total integer operations executed per second on the :ref:`VALU `. + This is also presented as a percent of the peak theoretical IOPs achievable + on the specific accelerator. Note: this does not include any integer operations + from :ref:`MFMA ` instructions.' + unit: GIOPs + MFMA FLOPs (F8): + rst: 'The total number of 8-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 8-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F8 MFMA operations achievable on the specific + accelerator. It is supported on AMD Instinct MI300 series and later only.'
+ unit: GFLOPs + MFMA FLOPs (BF16): + rst: 'The total number of 16-bit brain floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 16-bit brain + floating point operations from :ref:`VALU ` instructions. This is + also presented as a percent of the peak theoretical BF16 MFMA operations achievable + on the specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F16): + rst: 'The total number of 16-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit floating point operations + from :ref:`VALU ` instructions. This is also presented as a percent + of the peak theoretical F16 MFMA operations achievable on the specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F32): + rst: 'The total number of 32-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 32-bit floating point operations + from :ref:`VALU ` instructions. This is also presented as a percent + of the peak theoretical F32 MFMA operations achievable on the specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F64): + rst: 'The total number of 64-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 64-bit floating point operations + from :ref:`VALU ` instructions. This is also presented as a percent + of the peak theoretical F64 MFMA operations achievable on the specific accelerator.' + unit: GFLOPs + MFMA IOPs (Int8): + rst: 'The total number of 8-bit integer :ref:`MFMA ` operations executed + per second. Note: this does not include any 8-bit integer operations from :ref:`VALU + ` instructions. This is also presented as a percent of the peak theoretical + INT8 MFMA operations achievable on the specific accelerator.' + unit: GIOPs + Active CUs: + rst: Total number of active compute units (CUs) on the accelerator during the + kernel execution. 
+ unit: Number + SALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`SALU ` + was busy executing instructions. Computed as the ratio of the total number of + cycles spent by the :ref:`scheduler ` issuing SALU / :ref:`SMEM + ` instructions over the :ref:`total CU cycles `. + unit: Percent + VALU Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VALU ` + was busy executing instructions. Does not include :ref:`VMEM ` operations. + Computed as the ratio of the total number of cycles spent by the :ref:`scheduler + ` issuing VALU instructions over the :ref:`total CU cycles `. + unit: Percent + MFMA Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`MFMA ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles the :ref:`MFMA ` was busy over the :ref:`total + CU cycles `. + unit: Percent + VMEM Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`VMEM ` + unit was busy executing instructions, including both global/generic and spill/scratch + operations (see the :ref:`VMEM instruction count metrics ` + for more detail). Does not include :ref:`VALU ` operations. Computed + as the ratio of the total number of cycles spent by the :ref:`scheduler ` + issuing VMEM instructions over the :ref:`total CU cycles `. + unit: Percent + Branch Utilization: + rst: Indicates what percent of the kernel's duration the :ref:`branch ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing branch instructions + over the :ref:`total CU cycles `. + unit: Percent + VALU Active Threads: + rst: Indicates the average level of :ref:`divergence ` within + a wavefront over the lifetime of the kernel.
The number of work-items that were + active in a wavefront during execution of each :ref:`VALU ` instruction, + time-averaged over all VALU instructions run on all wavefronts in the kernel. + unit: Work-items + IPC: + rst: The ratio of the total number of instructions executed on the :doc:`CU ` + over the :ref:`total active CU cycles `. + unit: Instructions per-cycle + Wavefront Occupancy: + rst: 'The time-averaged number of wavefronts resident on the accelerator over + the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.' + unit: Wavefronts + Theoretical LDS Bandwidth: + rst: Indicates the maximum amount of bytes that could have been loaded from, stored + to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth + ` example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + unit: GB/s + LDS Bank Conflicts/Access: + rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler ` + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely uncontended + case. This is also presented in normalized form (i.e., the Bank Conflict Rate). + unit: Conflicts/Access + vL1D Cache Hit Rate: + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D cache RAM `. + unit: Percent + vL1D Cache BW: + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions per unit time. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size.
This + value does not consider partial requests, so e.g., if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. This is also presented as a percent of the peak theoretical bandwidth + achievable on the specific accelerator. + unit: GB/s + L2 Cache Hit Rate: + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2 Cache BW: + rst: The number of bytes looked up in the L2 cache per unit time. The number of + bytes is calculated as the number of cache lines requested multiplied by the + cache line size. This value does not consider partial requests, so e.g., if + only a single value is requested in a cache line, the data movement will still + be counted as a full cache line. This is also presented as a percent of the + peak theoretical bandwidth achievable on the specific accelerator. + unit: GB/s + L2-Fabric Read BW: + rst: "The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122\ + \ interface ` per unit time. This is also presented as a percent\ + \ of the peak theoretical bandwidth achievable on the specific accelerator." + unit: GB/s + L2-Fabric Write BW: + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + unit: GB/s + L2-Fabric Read Latency: + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + L2-Fabric Write Latency: + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + sL1D Cache Hit Rate: + rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+ Calculated as the ratio of the number of sL1D requests that hit over the number + of all sL1D requests. + unit: Percent + sL1D Cache BW: + rst: The number of bytes looked up in the sL1D cache per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. + unit: GB/s + L1I Hit Rate: + rst: The percent of L1I requests that hit on a previously loaded line in the cache. + Calculated as the ratio of the number of L1I requests that hit over the number + of all L1I requests. + unit: Percent + L1I BW: + rst: The number of bytes looked up in the L1I cache per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. + unit: GB/s + L1I Fetch Latency: + rst: The average number of cycles spent to fetch instructions to a :doc:`CU `. + unit: Cycles diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst index d923b0426d..2c494840e3 100644 --- a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst +++ b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst @@ -242,6 +242,11 @@ List metrics $ rocprof-compute analyze -p workloads/vcopy/MI200/ --list-metrics gfx90a +Show Description column which is excluded by default in cli output + .. code-block:: shell + + $ rocprof-compute analyze -p workloads/vcopy/MI200/ --list-metrics gfx90a --include-cols Description + Show System Speed-of-Light and CS_Busy blocks only ..
code-block:: shell diff --git a/projects/rocprofiler-compute/docs/sphinx/requirements.in b/projects/rocprofiler-compute/docs/sphinx/requirements.in index fd7e1ddb15..5c9ea61e34 100644 --- a/projects/rocprofiler-compute/docs/sphinx/requirements.in +++ b/projects/rocprofiler-compute/docs/sphinx/requirements.in @@ -1,2 +1,3 @@ rocm-docs-core==1.21.1 sphinxcontrib.datatemplates==0.11.0 +sphinx-jinja==2.0.2 diff --git a/projects/rocprofiler-compute/docs/sphinx/requirements.txt b/projects/rocprofiler-compute/docs/sphinx/requirements.txt index aa1a564d9a..03c40b8071 100644 --- a/projects/rocprofiler-compute/docs/sphinx/requirements.txt +++ b/projects/rocprofiler-compute/docs/sphinx/requirements.txt @@ -53,7 +53,8 @@ docutils==0.21.2 # myst-parser # pydata-sphinx-theme # sphinx -exceptiongroup==1.2.2 + # sphinx-jinja +exceptiongroup==1.3.0 # via ipython executing==2.2.0 # via stack-data @@ -87,6 +88,7 @@ jinja2==3.1.5 # via # myst-parser # sphinx + # sphinx-jinja jsonschema==4.23.0 # via nbformat jsonschema-specifications==2024.10.1 @@ -215,6 +217,7 @@ sphinx==8.1.3 # sphinx-copybutton # sphinx-design # sphinx-external-toc + # sphinx-jinja # sphinx-notfound-page # sphinxcontrib-datatemplates # sphinxcontrib-runcmd @@ -226,6 +229,8 @@ sphinx-design==0.6.1 # via rocm-docs-core sphinx-external-toc==1.0.1 # via rocm-docs-core +sphinx-jinja==2.0.2 + # via -r requirements.in sphinx-notfound-page==1.0.4 # via rocm-docs-core sphinxcontrib-applehelp==2.0.0 @@ -268,6 +273,7 @@ traitlets==5.14.3 # nbformat typing-extensions==4.12.2 # via + # exceptiongroup # ipython # myst-nb # pydata-sphinx-theme diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py index 0e19b5b8f4..a475ec4214 100644 --- a/projects/rocprofiler-compute/src/argparser.py +++ b/projects/rocprofiler-compute/src/argparser.py @@ -202,7 +202,7 @@ Examples: nargs="?", const="", # Argument to --list-metrics is optional - choices=[""] + list(supported_archs.keys()), # 
["gfx906", "gfx908", "gfx90a"], + choices=[""] + list(supported_archs.keys()), # ["gfx908", "gfx90a"], help=print_avail_arch(supported_archs.keys()), ) profile_group.add_argument( @@ -623,7 +623,18 @@ Examples: dest="cols", metavar="", nargs="+", - help="\t\tSpecify column indices to display.", + help="\t\tSpecify column indices to display.\n\t\tDefaults to display all columns.", + ) + analyze_advanced_group.add_argument( + "--include-cols", + dest="include_cols", + metavar="", + nargs="+", + help=( + "\t\tSpecify which hidden column names should be included in cli output.\n" + "\t\tFor example, to show 'Description' column which is hidden by default in cli output,\n" + "\t\tuse the option --include-cols Description." + ), ) analyze_advanced_group.add_argument( "-g", dest="debug", action="store_true", help="\t\tDebug single metric." diff --git a/projects/rocprofiler-compute/src/config.py b/projects/rocprofiler-compute/src/config.py index 8d27530e38..5cb8b279cf 100644 --- a/projects/rocprofiler-compute/src/config.py +++ b/projects/rocprofiler-compute/src/config.py @@ -28,7 +28,8 @@ from pathlib import Path rocprof_compute_home = Path(__file__).resolve().parent PROJECT_NAME = "rocprofiler-compute" -HIDDEN_COLUMNS = ["Tips", "coll_level"] +HIDDEN_COLUMNS = ["coll_level"] +HIDDEN_COLUMNS_CLI = ["Description", "coll_level"] HIDDEN_SECTIONS = [400, 1900, 2000] TIME_UNITS = {"s": 10**9, "ms": 10**6, "us": 10**3, "ns": 1} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py index d1aa76724b..bba8ce3ebb 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py @@ -25,6 +25,7 @@ import copy import os import sys +import textwrap from abc import ABC, abstractmethod from collections import OrderedDict from pathlib import Path @@ -96,15 +97,28 @@ class 
OmniAnalyze_Base: sys_info.iloc[0], ) + metric_descriptions = { + k: v + for dfs in self._arch_configs[args.list_metrics].dfs.values() + for k, v in dfs.to_dict().get("Description", {}).items() + } for key, value in self._arch_configs[args.list_metrics].metric_list.items(): prefix = "" + description = "" if "." not in str(key): prefix = "" elif str(key).count(".") == 1: prefix = "\t" else: prefix = "\t\t" - print(prefix + key, "->", value) + description = metric_descriptions.get(key, "") + print(prefix + key, "->", value + "\n") + if description: + print( + prefix + + f"\n{prefix}".join(textwrap.wrap(description, width=40)) + + "\n" + ) sys.exit(0) else: console_error("Unsupported arch") diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml index ccf1309850..55c6f6bb24 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml @@ -1,14 +1,14 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py Panel Config: - id: 000 + id: 0 title: Top Stats + metrics_description: {} data source: - - raw_csv_table: - id: 001 - title: Top Kernels - source: pmc_kernel_top.csv - - - raw_csv_table: - id: 002 - title: Dispatch List - source: pmc_dispatch_info.csv + - raw_csv_table: + id: 1 + title: Top Kernels + source: pmc_kernel_top.csv + - raw_csv_table: + id: 2 + title: Dispatch List + source: pmc_dispatch_info.csv diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml index b7ec29eaf9..8470ffbbe3 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml @@ -1,9 +1,10 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 100 title: System Info + metrics_description: {} data source: - - raw_csv_table: - id: 101 - source: sysinfo.csv - columnwise: True + - raw_csv_table: + id: 101 + source: sysinfo.csv + columnwise: true diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system-speed-of-light.yaml deleted file mode 100644 index 2586d5bab1..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system-speed-of-light.yaml +++ /dev/null @@ -1,236 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 200 - title: System Speed-of-Light - data source: - - metric_table: - id: 201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: None # No perf counter - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: None # No perf counter - tips: - VALU IOPs: - value: None # No perf counter - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: None # No perf counter - tips: - MFMA FLOPs (BF16): - value: None # No perf counter - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 512) / 1000) - pop: None # No perf counter - tips: - MFMA FLOPs (F16): - value: None # No perf counter - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: None # No perf counter - tips: - MFMA FLOPs (F32): - value: None # No perf counter - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: None # No perf counter - tips: - MFMA FLOPs (F64): - value: None # No perf counter - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: None # No perf counter - tips: - MFMA IOPs (Int8): - value: None # No perf counter - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: None # No perf counter - tips: - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - MFMA Utilization: - value: None # No HW module - unit: pct - peak: 100 - pop: None # No HW module - 
tips: - VMEM Utilization: - value: None # No HW module - unit: pct - peak: 100 - pop: None # No HW module - tips: - Branch Utilization: - value: None # No HW module - unit: pct - peak: 100 - pop: None # No HW module - tips: - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) - tips: - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - tips: - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - 
None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - tips: - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - tips: - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - tips: - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - tips: - L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Write BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Read Latency: - value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if 
(TCC_EA_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - L2-Fabric Write Latency: - value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml new file mode 100644 index 0000000000..3c3a8097f4 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml @@ -0,0 +1,317 @@ +# AUTOGENERATED FILE. 
Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 200 + title: System Speed-of-Light + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations + executed per second. This does not include any 8-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. 
This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. 
+ Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.' + Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS + scheduler due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the Bank + Conflict Rate). + vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that + hit in vL1D cache over the total number of cache line requests to the vL1D cache + RAM. 
+ vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit + in the L2 cache over the total number of incoming cache line requests to the + L2 cache. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: The time-averaged number of cycles read requests spent + in Infinity Fabric before data was returned to the L2. + L2-Fabric Write Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. 
+ sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded + line in the cache. Calculated as the ratio of the number of sL1D requests that + hit over the number of all sL1D requests. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line + in the cache. Calculated as the ratio of the number of L1I requests that hit + over the number of all L1I requests. + L1I BW: The number of bytes looked up in the L1I cache per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + L1I Fetch Latency: The average number of cycles spent to fetch instructions to + a CU. + data source: + - metric_table: + id: 201 + title: System Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: None + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: None + VALU IOPs: + value: None + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: None + MFMA FLOPs (BF16): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 512) / 1000) + pop: None + MFMA FLOPs (F16): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: None + MFMA FLOPs (F32): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: None + MFMA FLOPs (F64): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: None + MFMA IOPs (Int8): + value: None + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: None + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * 
SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: None + unit: pct + peak: 100 + pop: None + VMEM Utilization: + value: None + unit: pct + peak: 100 + pop: None + Branch Utilization: + value: None + unit: pct + peak: 100 + pop: None + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / 
(End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_mem_chart.yaml 
deleted file mode 100644 index 8ff885f13f..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_mem_chart.yaml +++ /dev/null @@ -1,310 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 300 - title: Memory Chart - data source: - - metric_table: - id: 301 - title: Memory Chart - header: - metric: Metric - #alias: #alias - value: Value - tips: Tips - metric: - # ---------------------------------------- - # Instr Buff Block - - #TODO: double check wave_occupancy - Wavefront Occupancy: - #alias: wave_occ_ - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), 0) - coll_level: SQ_LEVEL_WAVES - tips: - Wave Life: - #alias: wave_life_ - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) - tips: - - # ---------------------------------------- - # Instr Dispatch Block - SALU: - #alias: salu_ - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - tips: - SMEM: - #alias: smem_ - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - tips: - VALU: - #alias: valu_ - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - tips: - VMEM: - #alias: vmem_ - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - tips: - LDS: - #alias: lds_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - GWS: - #alias: gws_ - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - tips: - BR: - #alias: br_ - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - tips: - - # ---------------------------------------- - # Exec Block - Active CUs: - #alias: active_cu_ - value: $numActiveCUs - tips: - Num CUs: - #alias: num_cu_ - value: $cu_per_gpu - tips: - VGPR: - #alias: vgpr_ - value: ROUND(AVG(Arch_VGPR), 0) - tips: - SGPR: - #alias: sgpr_ - value: ROUND(AVG(SGPR), 0) - tips: - LDS Allocation: - #alias: lds_alloc_ - value: 
ROUND(AVG(LDS_Per_Workgroup), 0) - tips: - Scratch Allocation: - #alias: scratch_alloc_ - value: ROUND(AVG(Scratch_Per_Workitem), 0) - tips: - Wavefronts: - #alias: wavefronts_ - value: ROUND(AVG(SPI_CSN_WAVE), 0) - tips: - Workgroups: - #alias: workgroups_ - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - tips: - - # ---------------------------------------- - # LDS Block - LDS Req: - #alias: lds_req_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - LDS Util: - #alias: lds_util_ - value: - ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))), - 0) - tips: - LDS Latency: - #alias: lds_lat - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - tips: - - # ---------------------------------------- - # Vector L1 Cache Block - VL1 Rd: - #alias: vl1_rd_ - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - tips: - VL1 Wr: - #alias: vl1_wr_ - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - tips: - VL1 Atomic: - #alias: vl1_atom_ - value: - ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - tips: - - VL1 Hit: - #alias: vl1_hit_ - value: - ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None )), 0) - tips: - VL1 Lat: - #alias: vl1_lat_ - value: - ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)), 0) - tips: - VL1 Coalesce: - #alias: vl1_coales_ - value: - ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - tips: - VL1 Stall: - #alias: vl1_stall_ - value: - ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - tips: - - VL1_L2 
Rd: - #alias: vl1_l2_rd_ - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - tips: - VL1_L2 Wr: - #alias: vl1_l2_wr_ - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - tips: - VL1_L2 Atomic: - #alias: vl1_l2_atom_ - value: - ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - tips: - - # ---------------------------------------- - # Scalar L1D Cache Block - VL1D Rd: - #alias: sl1_rd_ - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - tips: - VL1D Hit: - #alias: sl1_hit_ - value: - ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - tips: - VL1D Lat: - #alias: sl1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - tips: - - VL1D_L2 Rd: - #alias: sl1_l2_rd_ - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - tips: - VL1D_L2 Wr: - #alias: sl1_l2_wr_ - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - tips: - VL1D_L2 Atomic: - #alias: sl1_l2_atom_ - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # Instr L1 Cache Block - IL1 Fetch: - #alias: il1_fetch_ - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - tips: - IL1 Hit: - #alias: il1_hit_ - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - tips: - IL1 Lat: - #alias: il1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != - 0) else None)) * 100), 0) - tips: # ??? 
coll_level: SQ_IFETCH_LEVEL - IL1_L2 Rd: - #alias: il1_l2_req_ - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # L2 Cache Block(inside) - L2 Rd: - #alias: l2_rd_ - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - tips: - L2 Wr: - #alias: l2_wr_ - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - tips: - L2 Atomic: - #alias: l2_atom_ - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - tips: - L2 Hit: - #alias: l2_hit_ - value: - ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)), 0) - tips: - L2 Rd Lat: - #alias: l2_rd_lat_ - value: - ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), - 0) - tips: - L2 Wr Lat: - #alias: l2_wr_lat_ - value: - ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - != 0) else None)), 0) - tips: - - # ---------------------------------------- - # Fabric Block - Fabric_L2 Rd: - #alias: l2_fabric_rd_ - value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) - tips: - Fabric_L2 Wr: - #alias: l2_fabric_wr_ - value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) - tips: - Fabric_L2 Atomic: - #alias: l2_fabric_atom_ - value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) - tips: - - Fabric Rd Lat: - #alias: fabric_rd_lat_ - value: - ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else 0)), 0) - tips: - Fabric Wr Lat: - #alias: fabric_wr_lat_ - value: - ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else 0)), 0) - tips: - Fabric Atomic Lat: - #alias: fabric_atom_lat_ - value: - ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else 0)), 0) - tips: - - HBM Rd: - #alias: hbm_rd_ - 
value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) - tips: - HBM Wr: - #alias: hbm_wr_ - value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) - tips: - - comparable: false # for now - cli_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml new file mode 100644 index 0000000000..2ac5ca10b4 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml @@ -0,0 +1,267 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 300 + title: Memory Chart + metrics_description: + Wavefront Occupancy: Wavefronts per active CU. + Wave Life: Average number of cycles executing a wave. + SALU: Total number of SALU (Scalar ALU) instructions issued per normalization + unit. + SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization + unit. + VALU: The number of VALU (Vector ALU) instructions issued per normalization unit. + MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per + normalization unit. + VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit. + LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's __shfl instructions) executed per normalization unit. + GWS: Total number of GDS (global data sync) instructions issued per normalization + unit. + BR: Total number of BRANCH instructions issued per normalization unit. + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + Num CUs: Total number of compute units (CUs) on the accelerator. 
+ VGPR: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + SGPR: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Wavefronts: The total number of wavefronts, summed over all workgroups, forming + this kernel launch. + Workgroups: The total number of workgroups forming this kernel launch. + LDS Req: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Util: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. 
+ VL1 Rd: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + VL1 Wr: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + VL1 Atomic: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + VL1 Coalesce: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting + to issue a request for data to the L2 cache divided by the number of cycles + where the vL1D is active. + VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 cache, per normalization + unit. + VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + sL1D Rd: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization + unit. 
+ sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + IL1 Fetch: The total number of requests made to the L1I per normalization-unit. + IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over the + number of all L1I requests. + IL1 Lat: The average number of cycles spent to fetch instructions to a CU. + IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit. + L2 Rd: The total number of read requests to the L2 from all clients. + L2 Wr: The total number of write requests to the L2 from all clients. + L2 Atomic: The total number of atomic requests (with and without return) to the + L2 from all clients. + L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values. + Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. + Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit.
+ Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either + 32-byte or 64-byte) that are actually atomic requests summed over TCC instances + per normalization unit. + Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in + Infinity Fabric before a completion acknowledgement (atomic without return value) + or data (atomic with return value) was returned to the L2. + HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator''s local HBM, per normalization + unit. 
' + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + value: Value + metric: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else + 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: None + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None )), 0) + VL1 
Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if + ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + + 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + != 0) else None)), 0) + L2 Wr Lat: + value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + != 0) else None)), 0) + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) + comparable: false + cli_style: mem_chart + tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml new file mode 100644 index 0000000000..41c8bac547 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml @@ -0,0 +1,9 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 400 + title: Roofline + metrics_description: {} + data source: + - None: + id: 401 + title: Roofline diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command-processor.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command-processor.yaml deleted file mode 100644 index 164b3552bf..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command-processor.yaml +++ /dev/null @@ -1,135 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 500 - title: Command Processor (CPC/CPF) - data source: - - metric_table: - id: 501 - title: Command Processor Fetcher - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - 
min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - tips: - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - unit: pct - tips: - - - metric_table: - id: 502 - title: Packet Processor - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else 
None)) - unit: pct - tips: - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - tips: - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: Pct - tips: - CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: pct - tips: - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * 
CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml new file mode 100644 index 0000000000..c4d2cabf52 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml @@ -0,0 +1,145 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 500 + title: Command Processor (CPC/CPF) + metrics_description: + CPF Utilization: Percent of total cycles where the CPF was busy actively doing + any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was + stalled for any reason. + CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address + translation. + CPC Utilization: Percent of total cycles where the CPC was busy actively doing + any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason. + CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands + for processing. + CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching + workgroups to the workgroup manager. 
+ CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where + the CPC-L2 interface was active doing any work. + CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address + translation + CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address + translation interface where the CPC was busy doing address translation work. ' + data source: + - metric_table: + id: 501 + title: Command processor fetcher (CPF) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / 
CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + - metric_table: + id: 502 + title: Command processor packet processor (CPC) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / 
CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_shader-processor-input.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_shader-processor-input.yaml deleted file mode 100644 index c78c3645a0..0000000000 --- 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_shader-processor-input.yaml +++ /dev/null @@ -1,167 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 600 - title: Workgroup Manager (SPI) - data source: - - metric_table: - id: 601 - title: Workgroup Manager Utilizations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - tips: - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - tips: - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - tips: - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - tips: - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Dispatched Workgroups: - avg: 
AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - tips: - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - - metric_table: - id: 602 - title: Workgroup Manager - Resource Allocation - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) 
/ ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - unit: Pct - tips: - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - tips: - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: 
Pct - tips: - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml new file mode 100644 index 0000000000..f6bf13d8b8 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml @@ -0,0 +1,201 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 600 + title: Workgroup Manager (SPI) + metrics_description: + Accelerator Utilization: The percent of cycles in the kernel where the accelerator + was actively doing any work. + Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the + kernel where the scheduler-pipes were actively doing any work. + Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup + manager was actively doing any work. 
+ Shader Engine Utilization: The percent of total shader engine cycles in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. + SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. + Dispatched Workgroups: The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups, + forming this kernel launch. + VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation. + Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the workgroup manager rather than a lack of a CU or SIMD with sufficient + resources. + Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient + resources. ' + Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel + where a workgroup could not be scheduled to a CU due to occupancy limitations + (like a lack of a CU or SIMD with sufficient resources). + Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where + a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch) + memory slots. 
While this can reach up to 100%, note that the actual occupancy + limitations on a kernel using private memory are typically quite small (for + example, less than 1% of the total number of waves that can be scheduled to + an accelerator). + Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available waveslots. + Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available SGPRs. + Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup + could not be scheduled to a CU due to lack of available LDS. + Insufficient CU Barriers: The percent of total CU cycles in the kernel where a + workgroup could not be scheduled to a CU due to lack of available barriers. + Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where + a workgroup could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). + Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where + a wavefront could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators).
+ data source: + - metric_table: + id: 601 + title: Workgroup manager utilizations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * 
SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + - metric_table: + id: 602 + title: Workgroup Manager - Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else 
None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN 
/ ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront-launch.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront-launch.yaml deleted file mode 100644 index cc650e9bc0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront-launch.yaml +++ /dev/null @@ -1,142 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 700 - title: Wavefront - data source: - - metric_table: - id: 701 - title: Wavefront Launch Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - tips: - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - tips: - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - tips: - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - tips: - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - tips: - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - 
tips: - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - tips: - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - tips: - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - tips: - - - metric_table: - id: 702 - title: Wavefront Runtime Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - tips: - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - tips: - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - tips: - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - tips: - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - 
unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml new file mode 100644 index 0000000000..5e332c0b8f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml @@ -0,0 +1,173 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 700 + title: Wavefront + metrics_description: + Grid Size: The total number of work-items (or, threads) launched as a part of + the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + Workgroup Size: The total number of work-items (or, threads) in each workgroup + (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent + to the total block size. + Total Wavefronts: "The total number of wavefronts launched as part of the kernel\ + \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\ + \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\ + \ should be equivalent to the ceiling of grid size divided by 64." + Saved Wavefronts: The total number of wavefronts saved at a context-save. + Restored Wavefronts: The total number of wavefronts restored from a context-save. + VGPRs: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + AGPRs: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs + requested by the compiler due to allocation granularity.' 
+ SGPRs: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Kernel Time: The total duration of the executed kernel. + Kernel Time (Cycles): The total duration of the executed kernel in cycles. + Instructions per wavefront: The average number of instructions (of all types) + executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per normalization unit. This is averaged over all wavefronts + in a kernel dispatch. + Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch + was stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar + memory, etc.) per normalization unit. This is averaged over all wavefronts in a kernel dispatch. + Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was + unable to issue an instruction for any reason (e.g., execution pipe back-pressure, + arbitration loss, etc.) per normalization unit. This counter is incremented + at every cycle by all wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could be + actively executing while a wave is issue stalled.
The sum of this metric, Dependency + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: The average number of cycles a wavefront in the kernel dispatch + was actively executing instructions per normalization unit. This measurement + is made on a per-wavefront basis, and may include cycles that another wavefront + spent actively executing (on another execution unit, for example) or was stalled. + As such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter. The sum of this metric, Issue + Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles + metric. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max:
MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute-unit-instruction-mix.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute-unit-instruction-mix.yaml deleted file mode 100644 index d980e784a4..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute-unit-instruction-mix.yaml +++ /dev/null @@ -1,129 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1000 - title: Compute Units - Instruction Mix - data source: - - metric_table: - id: 1001 - title: Overall Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - LDS: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - tips: - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - tips: - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - tips: - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1002 - title: VALU Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - - - metric_table: - id: 1003 - title: VMEM Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml new file mode 100644 index 0000000000..b820b8de60 --- /dev/null +++ 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml @@ -0,0 +1,189 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + metrics_description: + VALU: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the compute unit, and are used to execute a wide + range of instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, shifts, + conditional evaluation, etc. + VMEM: The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to generic, global, private + and texture memory. + LDS: The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's __shfl operations. + MFMA: The total number of matrix fused multiply-add instructions issued. + SALU: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + SMEM: The total number of scalar memory (SMEM) operations issued. These are typically + used for loading kernel arguments, base-pointers and loads from HIP's __constant__ + memory. + Branch: The total number of branch operations issued. These typically consist + of jump or branch operations and are used to implement control flow. + INT32: The total number of instructions operating on 32-bit integer operands issued + to the VALU per normalization unit. 
+ INT64: The total number of instructions operating on 64-bit integer operands issued + to the VALU per normalization unit. + F16-ADD: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-FMA: The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per normalization unit. + F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating + on 16-bit floating-point operands issued to the VALU per normalization unit. + F32-ADD: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-FMA: The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per normalization unit. + F32-Trans: The total number of transcendental instructions (such as sqrt) operating + on 32-bit floating-point operands issued to the VALU per normalization unit. + F64-ADD: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-FMA: The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per normalization unit. + F64-Trans: The total number of transcendental instructions (such as sqrt) operating + on 64-bit floating-point operands issued to the VALU per normalization unit. 
+ Conversion: "The total number of type conversion instructions (such as converting\ + \ data to or from F32\u2194F64) issued to the VALU per normalization unit." + Global/Generic Instr: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read: The total number of global & generic memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Write: The total number of global & generic memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Atomic: The total number of global & generic memory atomic (with + and without return) instructions executed on all compute units on the accelerator, + per normalization unit. + Spill/Stack Instr: The total number of spill/stack memory instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Read: The total number of spill/stack memory read instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Write: The total number of spill/stack memory write instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per normalization + unit. Typically unused as these memory operations are typically used to implement + thread-local storage. + MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization + unit. + MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per + normalization unit. This is supported in AMD Instinct MI300 series and later + only. + MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per + normalization unit. 
+ MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued + per normalization unit. + MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per + normalization unit. + MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per + normalization unit. + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1002 + title: VALU Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} + - metric_table: + id: 1003 + title: VMEM Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: 
+ avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1004 + title: MFMA Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute-unit-compute-pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute-unit-compute-pipeline.yaml deleted file mode 100644 index 2021ff08ea..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute-unit-compute-pipeline.yaml +++ /dev/null @@ -1,84 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - data source: - - metric_table: - id: 1101 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - - - metric_table: - id: 1102 - title: Pipeline Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - tips: - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - tips: - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: 
MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - tips: - - - metric_table: - id: 1103 - title: Arithmetic Operations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml new file mode 100644 index 0000000000..9dd3dc97c4 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml @@ -0,0 +1,147 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1100 + title: Compute Units - Compute Pipeline + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. 
Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. + IPC (Issued): The ratio of the total number of (non-internal) instructions issued + over the number of cycles where the scheduler was actively working on issuing + instructions. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. 
+ VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA unit was busy over the total CU cycles. + MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. + VMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a VMEM instruction to complete. + SMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a SMEM instruction to complete. + FLOPs (Total): The total number of floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + IOPs (Total): The total number of integer operations executed on either the VALU + or MFMA units, per normalization unit.
+ F16 OPs: The total number of 16-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + BF16 OPs: The total number of 16-bit brain floating-point operations executed + on either the VALU or MFMA units, per normalization unit. + F32 OPs: The total number of 32-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + F64 OPs: The total number of 64-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + INT8 OPs: The total number of 8-bit integer operations executed on either the + VALU or MFMA units, per normalization unit. + data source: + - metric_table: + id: 1101 + title: Compute Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: {} + - metric_table: + id: 1102 + title: Pipeline Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / 
$cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_lds.yaml deleted file mode 100644 index 2c3fc34b2a..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_lds.yaml +++ /dev/null @@ -1,118 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1200 - title: Local Data Share (LDS) - data source: - - metric_table: - id: 1201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Theoretical Bandwidth: - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - unit: Pct of Peak - tips: - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1202 - title: LDS Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - LDS Instrs: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - tips: - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - unit: (Bytes + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: - Bank Conflicts/Access: - avg: 
AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - tips: - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - tips: - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - tips: - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml new file mode 100644 index 0000000000..6cfe19d9de --- /dev/null +++ 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml @@ -0,0 +1,141 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1200 + title: Local Data Share (LDS) + metrics_description: + Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS + instructions, averaged over the lifetime of the kernel. Calculated as the ratio + of the total number of cycles spent by the scheduler issuing LDS instructions + over the total CU cycles. + Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been + loaded from, stored to, or atomically updated in the LDS per normalization unit. + Does not take into account the execution mask of the wavefront when the instruction + was executed. + Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent + servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing + bank conflicts over the number of LDS cycles that would have been required to + move the same amount of data in an uncontended access. + LDS Instructions: The total number of LDS instructions (including, but not limited + to, read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. 
+ Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. + Index Accesses: The total number of cycles spent in the LDS scheduler over all + operations per normalization unit. + Atomic Return Cycles: The total number of cycles spent on LDS atomics with return + per normalization unit. + Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Addr Conflict: The total number of cycles spent in the LDS scheduler due to address + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Unaligned Stall: The total number of cycles spent in the LDS scheduler due to + stalls from non-dword aligned addresses per normalization unit. + Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\ + \ normalization unit. This is unused and expected to be zero in most configurations\ + \ for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1201 + title: LDS Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth: + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1202 + title: LDS Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 
0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction-cache.yaml deleted file mode 100644 index 209a42726e..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction-cache.yaml +++ /dev/null @@ -1,105 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. 
-Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1300 - title: Instruction Cache - data source: - - metric_table: - id: 1301 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - tips: - L1I-L2 Bandwidth: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1302 - title: Instruction Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - tips: - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / 
((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - tips: - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - tips: - - metric_table: - id: 1303 - title: Instruction Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) - min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) - max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml new file mode 100644 index 0000000000..a53c23691f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml @@ -0,0 +1,106 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1300 + title: Instruction Cache + metrics_description: + Bandwidth: The number of bytes looked up in the L1I cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of L1I requests over the + total L1I cycles. + Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously + loaded line in the cache. Calculated as the ratio of the number of L1I requests + that hit over the number of all L1I requests. + L1I-L2 Bandwidth: "The percent of the peak theoretical L1I \u2192 L2 cache request\ + \ bandwidth achieved. Calculated as the ratio of the total number of requests\ + \ from the L1I to the L2 cache over the total L1I-L2 interface cycles."
+ Req: The total number of requests made to the L1I per normalization-unit + Hits: The total number of L1I requests that hit on a previously loaded cache line, + per normalization-unit. + Misses - Non Duplicated: The total number of L1I requests that missed on a cache + line that were not already pending due to another request, per normalization-unit. + Misses - Duplicated: The total number of L1I requests that missed on a cache line + that were already pending due to another request, per normalization-unit. + Instruction Fetch Latency: The average number of cycles spent to fetch instructions + to a CU. + data source: + - metric_table: + id: 1301 + title: L1I Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1302 + title: L1I cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / 
$denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + - metric_table: + id: 1303 + title: L1I <-> L2 interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml deleted file mode 100644 index 669a5834b9..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_constant-cache.yaml +++ /dev/null @@ -1,171 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1400 - title: Scalar L1 Data Cache - data source: - - metric_table: - id: 1401 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - tips: - sL1D-L2 BW: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) - / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1402 - title: Scalar L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - tips: - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / 
((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - tips: - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) 
- unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1403 - title: Scalar L1D Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml new file mode 100644 index 0000000000..d43157ce8e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml @@ -0,0 +1,186 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1400 + title: Scalar L1 Data Cache + metrics_description: + Bandwidth: The number of bytes looked up in the sL1D cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the + total sL1D cycles. + Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously + loaded line the cache. The ratio of the number of sL1D requests that hit over + the number of all sL1D requests. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, per normalization unit. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." + Req: The total number of requests, of any size or type, made to the sL1D per normalization + unit. + Hits: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache + line that was not already pending due to another request, per normalization + unit. ' + Misses- Duplicated: The total number of sL1D requests that missed on a cache line + that was already pending due to another request, per normalization unit. + Read Req (Total): The total number of sL1D read requests of any size, per normalization + unit. + Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Read Req (1 DWord): The total number of sL1D read requests made for a single dword + of data (4B), per normalization unit. + Read Req (2 DWord): The total number of sL1D read requests made for a two dwords + of data (8B), per normalization unit. + Read Req (4 DWord): The total number of sL1D read requests made for a four dwords + of data (16B), per normalization unit. 
+ Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords + of data (32B), per normalization unit. + Read Req (16 DWord): The total number of sL1D read requests made for a sixteen + dwords of data (64B), per normalization unit. + Read Req: The total number of read requests from sL1D to the L2 per normalization + unit. + Write Req: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\ + \ per normalization unit." + data source: + - metric_table: + id: 1401 + title: Scalar L1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1402 + title: Scalar L1D cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + 
unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: 
MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml deleted file mode 100644 index a59975bf17..0000000000 --- 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_TA_and_TD.yaml +++ /dev/null @@ -1,168 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1500 - title: Address Processing Unit and Data Return Path (TA/TD) - data source: - - metric_table: - id: 1501 - title: Address Processing Unit - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data-Processor → Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / 
$denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Total Cycles: - avg: 
AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1502 - title: Data-Return Path - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Cache RAM → Data-Return Stall: - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - tips: - 
Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml new file mode 100644 index 0000000000..754cbbb688 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml @@ -0,0 +1,233 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1500 + title: Address Processing Unit and Data Return Path (TA/TD) + metrics_description: + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy + Address Stall: Percent of the total CU cycles the address processor was stalled + from sending address requests further into the vL1D pipeline. + Data Stall: Percent of the total CU cycles the address processor was stalled from + sending write/atomic data further into the vL1D pipeline. + "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address + processor was stalled waiting to send command data to the data processor. + Total Instructions: The total number of memory instructions executed by the address + processer over all compute units on the accelerator, per normalization unit. 
+ Global/Generic Instructions: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read Instructions: The total number of global & generic memory + read instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Write Instructions: The total number of global & generic memory + write instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Atomic Instructions: The total number of global & generic memory + atomic (with and without return) instructions executed on all compute units + on the accelerator, per normalization unit. + Spill/Stack Instructions: The total number of spill/stack memory instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Read Instructions: The total number of spill/stack memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Write Instructions: The total number of spill/stack memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic + (with and without return) instructions executed on all compute units on the + accelerator, per normalization unit. Typically unused as these memory operations + are typically used to implement thread-local storage. + Spill/Stack Total Cycles: The number of cycles the address processing unit spent + working on spill/stack instructions, per normalization unit. + Spill/Stack Coalesced Read: The number of cycles the address processing unit spent + working on coalesced spill/stack read instructions, per normalization unit. + Spill/Stack Coalesced Write: The number of cycles the address processing unit + spent working on coalesced spill/stack write instructions, per normalization + unit. 
+ Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return + unit was stalled on data to be returned from the vL1D Cache RAM. + "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the + data-return unit was stalled by the workgroup manager due to initialization + of registers as a part of launching new workgroups. + Coalescable Instructions: The number of instructions submitted to the data-return + unit by the address processor that were found to be coalescable, per normalization + unit. + Read Instructions: The number of read instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack reads in the address processor. + Write Instructions: The number of store instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack stores in the address processor. + Atomic Instructions: The number of atomic instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack atomics in the address processor. 
+ data source: + - metric_table: + id: 1501 + title: Busy and stall metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + - metric_table: + id: 1502 + title: Instruction counts + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + 
Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + - metric_table: + id: 1503 + title: Spill and stack metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: 
AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: null + min: null + max: null + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: 
MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_L1_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_L1_cache.yaml deleted file mode 100644 index 452fa277ab..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_L1_cache.yaml +++ /dev/null @@ -1,414 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1600 - title: Vector L1 Data Cache - data source: - - metric_table: - id: 1601 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: Pct of Peak - tips: - Bandwidth: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - unit: Pct of Peak - tips: - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - tips: - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1602 - title: L1D Cache Stalls (%) - header: - metric: Metric - expr: 
Expression - tips: Tips - metric: - Stalled on L2 Data: - expr: - (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on L2 Req: - expr: - (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Address: - expr: - None - tips: - Stalled on Data: - expr: - None - tips: - Stalled on Latency FIFO: - expr: - None - tips: - Stalled on Request FIFO: - expr: - None - tips: - Stalled on Read Return: - expr: - None - tips: - Tag RAM Stall (Read): - expr: - (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Write): - expr: - (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Atomic): - expr: - (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - cli_style: simple_box - - - metric_table: - id: 1603 - title: L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / 
$denom)) - unit: (Req + $normUnit) - tips: - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - tips: - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: 
MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 BW: - avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - unit: (Bytes + $normUnit) - tips: - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - L1 Access Latency: - avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - unit: Cycles - tips: - L1-L2 Read Latency: - avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) - min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if 
((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) - max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) - unit: Cycles - tips: - L1-L2 Write Latency: - avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else - None)) - min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else - None)) - max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else - None)) - unit: Cycles - tips: - - - metric_table: - id: 1604 - title: L1D - L2 Transactions - header: - metric: Metric - xfer: Xfer - coherency: Coherency - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Write: - xfer: Write 
- coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1605 - title: L1D Addr Translation - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: 
MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - tips: - Inflight Req: - avg: None # Missing perfmon - min: None # Missing perfmon - max: None # Missing perfmon - units: (Req + $normUnit) - tips: - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - tips: - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - tips: - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - - metric_table: - id: 1606 - title: L1D Addr Translation Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml new file mode 100644 index 0000000000..96e021e378 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml @@ -0,0 +1,442 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. 
Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1600 + title: Vector L1 Data Cache + metrics_description: + Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + Bandwidth: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions, as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. The number of bytes is calculated as the number of cache + lines requested multiplied by the cache line size. This value does not consider + partial requests, so for instance, if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. + Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. + The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Coalescing: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled + waiting for requested data to return from the L2 cache divided by the number + of cycles where the vL1D is active. + Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled + waiting to issue a request for data to the L2 cache divided by the number of + cycles where the vL1D is active. + Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled + due to Read requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. 
+ Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled + due to Write requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled + due to Atomic requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Total Req: The total number of incoming requests from the address processing unit + after coalescing. + Read Req: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + Write Req: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + Atomic Req: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions per normalization unit. The number of bytes is calculated as the + number of cache lines requested multiplied by the cache line size. This value + does not consider partial requests, so for instance, if only a single value + is requested in a cache line, the data movement will still be counted as a full + cache line. + Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in + vL1D cache over the total number of cache line requests to the vL1D Cache RAM. + Cache Accesses: The total number of cache line lookups in the vL1D. + Cache Hits: The number of cache accesses minus the number of outgoing requests + to the L2 cache, that is, the number of cache line requests serviced by the + vL1D Cache RAM per normalization unit. + Invalidations: The number of times the vL1D was issued a write-back invalidate + command during the kernel's execution per normalization unit. 
This may be triggered + by, for instance, the buffer_wbinvl1 instruction. + L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result + of VMEM instructions, per normalization unit. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so for instance, if only a single + value is requested in a cache line, the data movement will still be counted + as a full cache line. + L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + L1-L2 Write: The number of write requests to a vL1D cache line that were sent + through the vL1D to the L2 cache, per normalization unit. + L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + L1 Access Latency: Calculated as the average number of cycles that a vL1D cache + line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache + took to issue and receive read requests from the L2 Cache. This number also + includes requests for atomics with return values. + L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D + cache took to issue and receive acknowledgement of a write request to the L2 + Cache. This number also includes requests for atomics without return values. + NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit.
+ RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit. + UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit. + CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit. + RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit. + Req: The number of translation requests made to the UTCL1 per normalization unit. + Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + Hits: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + Translation Misses: The total number of translation requests that missed in the + UTCL1 due to translation not being present in the cache, per normalization + unit. + Permission Misses: "The total number of translation requests that missed in the\ + \ UTCL1 due to a permission error, per normalization unit. This is unused and\ + \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
+ data source: + - metric_table: + id: 1601 + title: vL1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1602 + title: vL1D cache stall metrics + header: + metric: Metric + expr: Expression + metric: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1603 + title: vL1D cache access metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Req: + avg: 
AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: 
AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + L1 Access Latency: + avg: 
AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + unit: Cycles + L1-L2 Read Latency: + avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + unit: Cycles + L1-L2 Write Latency: + avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) + else None)) + min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) + else None)) + max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) + else None)) + unit: Cycles + - metric_table: + id: 1604 + title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: 
AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: 
MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1605 + title: L1 Unified Translation Cache (UTCL1) + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + - metric_table: + id: 1606 + title: L1D Addr Translation Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: {} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_L2_cache.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_L2_cache.yaml deleted file mode 100644 index e1c1bffd0f..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_L2_cache.yaml +++ /dev/null @@ -1,388 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1700 - title: L2 Cache - data source: - - metric_table: - id: 1701 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - unit: pct - tips: - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - tips: - L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - tips: - - - metric_table: - id: 1702 - title: L2 - Fabric Transactions - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read BW: - avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / $denom)) - min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / $denom)) - max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - 
TCC_EA_RDREQ_32B_sum) - * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Read Traffic: - avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - unit: pct - tips: - Remote Read Traffic: - avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - unit: pct - tips: - Write and Atomic BW: - avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / $denom)) - min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / $denom)) - max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Remote Write and Atomic Traffic: - avg: AVG((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Atomic Traffic: - avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Read Latency: - avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != - 0) else None)) - min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != - 0) else None)) - max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != - 0) else None)) - unit: Cycles - tips: - Write and Atomic Latency: - avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != - 0) else None)) - min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != - 0) else None)) - max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != - 0) else None)) - unit: Cycles - tips: - Atomic Latency: - avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if 
(TCC_EA_ATOMIC_sum - != 0) else None)) - unit: Cycles - tips: - - - metric_table: - id: 1703 - title: L2 Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Bandwidth: - avg: AVG((TCC_REQ_sum * 64) / $denom) - min: MIN((TCC_REQ_sum * 64) / $denom) - max: MAX((TCC_REQ_sum * 64) / $denom) - unit: (Bytes + $normUnit) - tips: - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - tips: - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / 
$denom)) - unit: (Misses + $normUnit) - tips: - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1704 - title: L2 Cache Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - - - metric_table: - id: 1705 - title: L2 - Fabric Interface Stalls - header: - metric: Metric - type: Type - transaction: Transaction - avg: Avg - 
min: Min - max: Max - unit: Unit - tips: Tips - style: - type: simple_multi_bar - metric: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - - - metric_table: - id: 1706 - title: L2 - Fabric Detailed Transaction Breakdown - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Read: - avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + 
$normUnit) - tips: - Write and Atomic (Uncached): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write and Atomic: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic: - avg: AVG((TCC_EA_ATOMIC_sum / $denom)) - min: MIN((TCC_EA_ATOMIC_sum / $denom)) - max: MAX((TCC_EA_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml new file mode 100644 index 0000000000..d9bc1ca1a9 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml @@ -0,0 +1,536 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1700 + title: L2 Cache + metrics_description: + Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator over the total L2 cycles. + Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. 
The number + of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric + interface per unit time. + L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity + Fabric interface by write and atomic operations per unit time. + HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + Read BW: The total number of bytes read by the L2 cache from Infinity Fabric per + normalization unit. + HBM Read Traffic: The percent of read requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Read bandwidth directed to the local HBM. + Remote Read Traffic: The percent of read requests generated by the L2 cache that + are routed to any memory location other than the accelerator's local high-bandwidth + memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the size of the request (meaning that 32B and 64B + requests are both counted as a single request), so this metric only approximates + the percent of the L2-Fabric Read bandwidth directed to a remote location. 
+ Uncached Read Traffic: The percent of read requests generated by the L2 cache + that are reading from an uncached memory allocation. Note, as described in the + request flow section, a single 64B read request is typically counted as two + uncached read requests. So, it is possible for the Uncached Read Traffic to + reach up to 200% of the total number of read requests. This breakdown does not + consider the size of the request (i.e., 32B and 64B requests are both counted + as a single request), so this metric only approximates the percent of the L2-Fabric + read bandwidth directed to an uncached memory location. + Write and Atomic BW: The total number of bytes written by the L2 over Infinity + Fabric by write and atomic operations per normalization unit. Note that on current + CDNA accelerators, such as the MI2XX, requests are only considered atomic by + Infinity Fabric if they are targeted at non-write-cacheable memory, for example, + fine-grained memory allocations or uncached memory allocations on the MI2XX. + HBM Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are routed to the accelerator's local high-bandwidth memory + (HBM). This breakdown does not consider the size of the request (meaning that + 32B and 64B requests are both counted as a single request), so this metric only + approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to the local HBM. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Remote Write and Atomic Traffic: The percent of write and atomic requests generated by the + L2 cache that are routed to any memory location other than the accelerator's + local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote + accelerator's HBM.
This breakdown does not consider the size of the request + (meaning that 32B and 64B requests are both counted as a single request), so + this metric only approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to a remote location. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Atomic Traffic: The percent of write requests generated by the L2 cache that are + atomic requests to any memory location. This breakdown does not consider the + size of the request (meaning that 32B and 64B requests are both counted as a + single request), so this metric only approximates the percent of the L2-Fabric + Write and Atomic bandwidth that is due to atomic requests. Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric if + they are targeted at fine-grained memory allocations or uncached memory allocations. + Uncached Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are targeting uncached memory allocations. This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + Read Latency: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Write and Atomic Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2.
+ Bandwidth: The number of bytes looked up in the L2 cache, per normalization unit. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for + example, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + Req: The total number of incoming requests to the L2 from all clients for all + request types, per normalization unit. + Read Req: The total number of read requests to the L2 from all clients. + Write Req: The total number of write requests to the L2 from all clients. + Atomic Req: The total number of atomic requests (with and without return) to the + L2 from all clients. + Streaming Req: The total number of incoming requests to the L2 that are marked + as streaming. The exact meaning of this may differ depending on the targeted + accelerator, however on an MI2XX this corresponds to non-temporal load or stores. + The L2 cache attempts to evict streaming requests before normal requests when + the L2 is at capacity. + Probe Req: The number of coherence probe requests made to the L2 cache from outside + the accelerator. On an MI2XX, probe requests may be generated by, for example, + writes to fine-grained device memory or by writes to coarse-grained device memory. + Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + Hits: The total number of requests to the L2 from all clients that hit in the + cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. + Misses: The total number of requests to the L2 from all clients that miss in the + cache. As noted in the Speed-of-Light section, these do not include hit-on-miss + requests. + Writeback: The total number of L2 cache lines written back to memory for any reason. 
+ Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system + or atomic built-ins), by the command processor's memory acquire/release fences, + or for other internal hardware reasons. + Writeback (Internal): The total number of L2 cache lines written back to memory + for internal hardware reasons, per normalization unit. + Writeback (vL1D Req): The total number of L2 cache lines written back to memory + due to requests initiated by the vL1D cache, per normalization unit. + Evict (Internal): The total number of L2 cache lines evicted from the cache due + to capacity limits, per normalization unit. + Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due + to invalidation requests initiated by the vL1D cache, per normalization unit. + NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per normalization unit. + UC Req: The total number of requests to the L2 that go to Uncached (UC) memory + allocations. + CC Req: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. + RW Req: The total number of requests to the L2 that go to Read-Write coherent + memory (RW) allocations. + Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled + on write or atomic requests to any memory location because too many write/atomic + requests were currently in flight, as a percent of the total active L2 cycles. + Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of + data from any memory location, per normalization unit. + Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of + data from any memory location, per normalization unit. + Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached + data from any memory location, per normalization unit. 64B requests for uncached + data are counted as two 32B uncached data requests.
+ HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or + 64B of data from any source other than the accelerator's local HBM, per normalization + unit. + Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to + write or atomically update 32B of data to any memory location, per normalization + unit. + Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric + to write or atomically update 32B or 64B of uncached data, per normalization + unit. + Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to + write or atomically update 64B of data in any memory location, per normalization + unit. + HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write + or atomically update 32B or 64B of data in the accelerator's local HBM, per + normalization unit. + Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to + write or atomically update 32B or 64B of data in any memory location other than + the accelerator's local HBM, per normalization unit. + Atomic: The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per normalization unit. See Request + flow for more detail. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at non-write-cacheable memory, such as fine-grained memory allocations or uncached + memory allocations on the MI2XX. + Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\ + \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\ + \ over the total active L2 cycles." 
+ Write Stall: The ratio of the total number of cycles the L2-Fabric interface was + stalled on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator or CPU) over the total active L2 cycles. + Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to remote PCIe connected accelerators or CPUs as a percent of + the total active L2 cycles. + Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on read requests to remote Infinity Fabric connected accelerators or + CPUs as a percent of the total active L2 cycles. + Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to the accelerator's local HBM as a percent of the total active + L2 cycles. + Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to remote PCIe connected accelerators or CPUs as a + percent of the total active L2 cycles. + Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on write or atomic requests to remote Infinity Fabric connected accelerators + or CPUs as a percent of the total active L2 cycles. + Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to accelerator's local HBM as a percent of the total + active L2 cycles. 
+ data source: + - metric_table: + id: 1701 + title: L2 Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + - metric_table: + id: 1702 + title: L2-Fabric interface metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read BW: + avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - 
TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if 
(TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + unit: Cycles + - metric_table: + id: 1703 + title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 64) / $denom) + min: MIN((TCC_REQ_sum * 64) / $denom) + max: MAX((TCC_REQ_sum * 64) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + 
Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: 
MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1704 + title: L2 Cache Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} + - metric_table: + id: 1705 + title: L2 - Fabric Interface stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + style: + type: simple_multi_bar + metric: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + - metric_table: + id: 1706 + title: L2 - Fabric interface detailed metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + 
avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) 
+ min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_L2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_L2_cache_per_channel.yaml deleted file mode 100644 index a787f360cf..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_L2_cache_per_channel.yaml +++ /dev/null @@ -1,350 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1800 - title: L2 Cache (per Channel) - data source: - - metric_table: - id: 1801 - title: Aggregate Stats (All channels) - header: - metric: Metric - avg: Avg - std dev: Std Dev - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + 
TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - std dev: STD(((((((((((((((((((((((((((((((((((100 * 
TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + 
TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + 
(TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) 
+ (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + 
(TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - unit: pct - tips: - # FIXME: other arggr metrics!! - - - metric_table: - id: 1802 - title: L2 Cache Hit Rate (pct) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1803 - title: L2 Requests (per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1804 - title: L2 Requests (per normUnit) - header: - metric: Channel - read req: L2 Read - write req: L2 Write - atomic req: L2 Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1805 - title: L2-Fabric Requests (per normUnit) - header: - metric: Channel - read req: L2-Fabric Read - write req: L2-Fabric Write and Atomic - atomic req: L2-Fabric Atomic - metric: - 
"::_1": - read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1806 - # title: L2-EA Latency (Cycles) - # header: - # metric: Metric - # read lat: L2-EA Read - # write lat: L2-EA Write - # atomic lat: L2-EA Atomic - # metric: - # "::_1": - # read lat: - # AVG(((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] - # != 0) else None)) - # write lat: - # AVG(((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] - # != 0) else None)) - # atomic lat: - # AVG(((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if - # (TCC_EA_ATOMIC[::_1] != 0) else 0)) - # placeholder_range: - # "::_1": 32 - # cli_style: simple_multiple_bar - - - metric_table: - id: 1806 - title: L2-Fabric Read Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1807 - title: L2-Fabric Write and Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1808 - title: L2-Fabric Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if - (TCC_EA_ATOMIC[::_1] != 0) else 0) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1809 - title: L2-Fabric Read Stall (Cycles per normUnit) - header: - metric: Channel - ea read stall - pcie: L2-Fabric Read Stall (PCIe) - ea read 
stall - if: L2-Fabric Read Stall (Infinity Fabric™) - ea read stall - hbm: L2-Fabric Read Stall (HBM) - metric: - "::_1": - ea read stall - pcie: None # Missing perfmon - ea read stall - if: None # Missing perfmon - ea read stall - hbm: None # Missing perfmon - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1810 - title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) - header: - metric: Channel - ea write stall - pcie: L2-Fabric Write Stall (PCIe) - ea write stall - if: L2-Fabric Write Stall (Infinity Fabric™) - ea write stall - hbm: L2-Fabric Write Stall (HBM) - ea write stall - starve: L2-Fabric Write Starve - metric: - "::_1": - ea write stall - pcie: None # Missing perfmon - ea write stall - if: None # Missing perfmon - ea write stall - hbm: None # Missing perfmon - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1812 - title: L2-Fabric (128B read requests per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - # tips: Number of 128-byte read requests sent to EA - cli_style: simple_box - tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml new file mode 100644 index 0000000000..f097a14b55 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml @@ -0,0 +1,323 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + metrics_description: + L2 Cache Hit Rate: The percent of total number of requests to the L2 from all + clients that hit in the cache. As noted in the Speed-of-Light section, this + includes hit-on-miss requests. + data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 + * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * + TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + + (100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 + * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * + TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) 
+ + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 + * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 + * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 + * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * + TCC_HIT[21])) +
(100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 + * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) +
(TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 + * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * + TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + + (100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 + * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * + TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) +
(TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 + * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * + TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + + (100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27]))
+ (100 + * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * + TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) +
(TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + unit: pct + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if
(TCC_EA_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: L2-Fabric Read Stall (PCIe) + ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1812 + title: L2-Fabric (128B read 
requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml index cb4385d4ba..e94471d7dc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml @@ -1,10 +1,11 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 2100 title: PC Sampling + metrics_description: {} data source: - - pc_sampling_table: - id: 2101 - title: PC Sampling - source: None # not support - comparable: false # enable it later + - pc_sampling_table: + id: 2101 + title: PC Sampling + source: ps_file + comparable: false diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml index ccf1309850..55c6f6bb24 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml @@ -1,14 +1,14 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py Panel Config: - id: 000 + id: 0 title: Top Stats + metrics_description: {} data source: - - raw_csv_table: - id: 001 - title: Top Kernels - source: pmc_kernel_top.csv - - - raw_csv_table: - id: 002 - title: Dispatch List - source: pmc_dispatch_info.csv + - raw_csv_table: + id: 1 + title: Top Kernels + source: pmc_kernel_top.csv + - raw_csv_table: + id: 2 + title: Dispatch List + source: pmc_dispatch_info.csv diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml index b7ec29eaf9..8470ffbbe3 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml @@ -1,9 +1,10 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 100 title: System Info + metrics_description: {} data source: - - raw_csv_table: - id: 101 - source: sysinfo.csv - columnwise: True + - raw_csv_table: + id: 101 + source: sysinfo.csv + columnwise: true diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml deleted file mode 100644 index 8943b5a65e..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml +++ /dev/null @@ -1,254 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 200 - title: System Speed-of-Light - data source: - - metric_table: - id: 201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - 
Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / 
(($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - tips: - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: 64 - pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) * 1.5625) - tips: - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - tips: - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - tips: - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - tips: - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - tips: - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - tips: - L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Write BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / 
(End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Read Latency: - value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - L2-Fabric Write Latency: - value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml new file mode 100644 index 0000000000..d7020cface --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml @@ -0,0 +1,337 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 200 + title: System Speed-of-Light + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations + executed per second. This does not include any 8-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. 
This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. 
+ VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.' + Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. 
+ LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS + scheduler due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the Bank + Conflict Rate). + vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that + hit in vL1D cache over the total number of cache line requests to the vL1D cache + RAM. + vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit + in the L2 cache over the total number of incoming cache line requests to the + L2 cache. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." 
+ L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: The time-averaged number of cycles read requests spent + in Infinity Fabric before data was returned to the L2. + L2-Fabric Write Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded + line in the cache. Calculated as the ratio of the number of sL1D requests that + hit over the number of all sL1D requests. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over the number + of all L1I requests. + L1I BW: The number of bytes looked up in the L1I cache per unit time. This + is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I Fetch Latency: The average number of cycles spent to fetch instructions to + a CU. 
+ data source: + - metric_table: + id: 201 + title: System Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: 
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / 
$cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: 64 + pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) * 1.5625) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Read 
Latency: + value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_mem_chart.yaml deleted file mode 100644 index d40b13a94b..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_mem_chart.yaml +++ /dev/null @@ -1,315 +0,0 @@ ---- -# Add description/tips for each metric in this 
section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 300 - title: Memory Chart - data source: - - metric_table: - id: 301 - title: Memory Chart - header: - metric: Metric - #alias: #alias - value: Value - tips: Tips - metric: - # ---------------------------------------- - # Instr Buff Block - - #TODO: double check wave_occupancy - Wavefront Occupancy: - #alias: wave_occ_ - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), 0) - coll_level: SQ_LEVEL_WAVES - tips: - Wave Life: - #alias: wave_life_ - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) - tips: - - # ---------------------------------------- - # Instr Dispatch Block - SALU: - #alias: salu_ - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - tips: - SMEM: - #alias: smem_ - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - tips: - VALU: - #alias: valu_ - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - tips: - MFMA: - #alias: mfma_ - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - tips: - VMEM: - #alias: vmem_ - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - tips: - LDS: - #alias: lds_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - GWS: - #alias: gws_ - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - tips: - BR: - #alias: br_ - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - tips: - - # ---------------------------------------- - # Exec Block - Active CUs: - #alias: active_cu_ - value: $numActiveCUs - tips: - Num CUs: - #alias: num_cu_ - value: $cu_per_gpu - tips: - VGPR: - #alias: vgpr_ - value: ROUND(AVG(Arch_VGPR), 0) - tips: - # Todo: add AGPRs - SGPR: - #alias: sgpr_ - value: ROUND(AVG(SGPR), 0) - tips: - LDS Allocation: - #alias: lds_alloc_ - value: ROUND(AVG(LDS_Per_Workgroup), 0) - tips: - Scratch Allocation: - #alias: scratch_alloc_ - value: ROUND(AVG(Scratch_Per_Workitem), 0) - tips: - Wavefronts: - #alias: 
wavefronts_ - value: ROUND(AVG(SPI_CSN_WAVE), 0) - tips: - Workgroups: - #alias: workgroups_ - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - tips: - - # ---------------------------------------- - # LDS Block - LDS Req: - #alias: lds_req_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - LDS Util: - #alias: lds_util_ - value: - ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))), - 0) - tips: - LDS Latency: - #alias: lds_lat - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - tips: - - # ---------------------------------------- - # Vector L1 Cache Block - VL1 Rd: - #alias: vl1_rd_ - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - tips: - VL1 Wr: - #alias: vl1_wr_ - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - tips: - VL1 Atomic: - #alias: vl1_atom_ - value: - ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - tips: - - VL1 Hit: - #alias: vl1_hit_ - value: - ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None )), 0) - tips: - VL1 Lat: - #alias: vl1_lat_ - value: - ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)), 0) - tips: - VL1 Coalesce: - #alias: vl1_coales_ - value: - ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - tips: - VL1 Stall: - #alias: vl1_stall_ - value: - ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - tips: - - VL1_L2 Rd: - #alias: vl1_l2_rd_ - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - tips: - VL1_L2 Wr: - #alias: vl1_l2_wr_ - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum 
/ $denom)), 0) - tips: - VL1_L2 Atomic: - #alias: vl1_l2_atom_ - value: - ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - tips: - - # ---------------------------------------- - # Scalar L1D Cache Block - VL1D Rd: - #alias: sl1_rd_ - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - tips: - VL1D Hit: - #alias: sl1_hit_ - value: - ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - tips: - VL1D Lat: - #alias: sl1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - tips: - - VL1D_L2 Rd: - #alias: sl1_l2_rd_ - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - tips: - VL1D_L2 Wr: - #alias: sl1_l2_wr_ - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - tips: - VL1D_L2 Atomic: - #alias: sl1_l2_atom_ - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # Instr L1 Cache Block - IL1 Fetch: - #alias: il1_fetch_ - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - tips: - IL1 Hit: - #alias: il1_hit_ - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - tips: - IL1 Lat: - #alias: il1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != - 0) else None)) * 100), 0) - tips: # ??? 
coll_level: SQ_IFETCH_LEVEL - IL1_L2 Rd: - #alias: il1_l2_req_ - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # L2 Cache Block(inside) - L2 Rd: - #alias: l2_rd_ - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - tips: - L2 Wr: - #alias: l2_wr_ - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - tips: - L2 Atomic: - #alias: l2_atom_ - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - tips: - L2 Hit: - #alias: l2_hit_ - value: - ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)), 0) - tips: - L2 Rd Lat: - #alias: l2_rd_lat_ - value: - ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), - 0) - tips: - L2 Wr Lat: - #alias: l2_wr_lat_ - value: - ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - != 0) else None)), 0) - tips: - - # ---------------------------------------- - # Fabric Block - Fabric_L2 Rd: - #alias: l2_fabric_rd_ - value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) - tips: - Fabric_L2 Wr: - #alias: l2_fabric_wr_ - value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) - tips: - Fabric_L2 Atomic: - #alias: l2_fabric_atom_ - value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) - tips: - - Fabric Rd Lat: - #alias: fabric_rd_lat_ - value: - ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else 0)), 0) - tips: - Fabric Wr Lat: - #alias: fabric_wr_lat_ - value: - ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else 0)), 0) - tips: - Fabric Atomic Lat: - #alias: fabric_atom_lat_ - value: - ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else 0)), 0) - tips: - - HBM Rd: - #alias: hbm_rd_ - 
value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) - tips: - HBM Wr: - #alias: hbm_wr_ - value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) - tips: - - comparable: false # for now - cli_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml new file mode 100644 index 0000000000..6fb757becb --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml @@ -0,0 +1,267 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 300 + title: Memory Chart + metrics_description: + Wavefront Occupancy: Wavefronts per active CU. + Wave Life: Average number of cycles executing a wave. + SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization + unit. + SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization + unit. + VALU: The number of VALU (Vector ALU) instructions issued per normalization unit. + MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per + normalization unit. + VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit. + LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's __shfl instructions) executed per normalization unit. + GWS: Total number of GDS (global data sync) instructions issued per normalization + unit. + BR: Total number of BRANCH instructions issued per normalization unit. + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + Num CUs: Total number of compute units (CUs) on the accelerator.
+ VGPR: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + SGPR: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Wavefronts: The total number of wavefronts, summed over all workgroups, forming + this kernel launch. + Workgroups: The total number of workgroups forming this kernel launch. + LDS Req: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Util: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. 
+ VL1 Rd: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + VL1 Wr: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + VL1 Atomic: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + VL1 Coalesce: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting + to issue a request for data to the L2 cache divided by the number of cycles + where the vL1D is active. + VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + sL1D Rd: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization + unit.
+ sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + IL1 Fetch: The total number of requests made to the L1I per normalization-unit. + IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over the + number of all L1I requests. + IL1 Lat: The average number of cycles spent to fetch instructions to a CU. + IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit. + L2 Rd: The total number of read requests to the L2 from all clients. + L2 Wr: The total number of write requests to the L2 from all clients. + L2 Atomic: The total number of atomic requests (with and without return) to the + L2 from all clients. + L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values. + Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. + Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit.
+ Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either + 32-byte or 64-byte) that are actually atomic requests summed over TCC instances + per normalization unit. + Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in + Infinity Fabric before a completion acknowledgement (atomic without return value) + or data (atomic with return value) was returned to the L2. + HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator''s local HBM, per normalization + unit. 
' + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + value: Value + metric: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else + 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if 
(TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if + ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: 
ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + != 0) else None)), 0) + L2 Wr Lat: + value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + != 0) else None)), 0) + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) + comparable: false + cli_style: mem_chart + tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml new file mode 100644 index 0000000000..41c8bac547 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml @@ -0,0 +1,9 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 400 + title: Roofline + metrics_description: {} + data source: + - None: + id: 401 + title: Roofline diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline_info.yaml deleted file mode 100644 index 1474b85cf2..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline_info.yaml +++ /dev/null @@ -1,8 +0,0 @@ ---- -Panel Config: - id: 400 - title: Roofline - data source: - - None: - id: 401 - title: Roofline \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command-processor.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command-processor.yaml deleted file mode 100644 index 164b3552bf..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command-processor.yaml +++ /dev/null @@ -1,135 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 500 - title: Command Processor (CPC/CPF) - data source: - - metric_table: - id: 501 - title: Command Processor Fetcher - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - tips: - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - 
!= 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - unit: pct - tips: - - - metric_table: - id: 502 - title: Packet Processor - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - tips: - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: Pct - tips: - CPC-L2 
Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: pct - tips: - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml new file mode 100644 index 0000000000..c4d2cabf52 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml @@ -0,0 +1,145 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 500 + title: Command Processor (CPC/CPF) + metrics_description: + CPF Utilization: Percent of total cycles where the CPF was busy actively doing + any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was + stalled for any reason. + CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address + translation. + CPC Utilization: Percent of total cycles where the CPC was busy actively doing + any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason. + CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands + for processing. + CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching + workgroups to the workgroup manager. + CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where + the CPC-L2 interface was active doing any work. + CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address + translation. + CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address + translation interface where the CPC was busy doing address translation work.
' + data source: + - metric_table: + id: 501 + title: Command processor fetcher (CPF) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if 
(CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + - metric_table: + id: 502 + title: Command processor packet processor (CPC) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + 
CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml deleted file mode 100644 index c78c3645a0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml +++ /dev/null @@ -1,167 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 600 - title: Workgroup Manager (SPI) - data source: - - metric_table: - id: 601 - title: Workgroup Manager Utilizations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - tips: - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - tips: - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - tips: - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - tips: - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - tips: - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - 
None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - - metric_table: - id: 602 - title: Workgroup Manager - Resource Allocation - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - 
unit: Pct - tips: - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - tips: - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Workgroup Limit: - avg: 
AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml new file mode 100644 index 0000000000..f6bf13d8b8 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml @@ -0,0 +1,201 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 600 + title: Workgroup Manager (SPI) + metrics_description: + Accelerator Utilization: The percent of cycles in the kernel where the accelerator + was actively doing any work. + Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the + kernel where the scheduler-pipes were actively doing any work. + Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup + manager was actively doing any work. + Shader Engine Utilization: The percent of total shader engine cycles in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. 
+ SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. + Dispatched Workgroups: The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups, + forming this kernel launch. + VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation. + Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the workgroup manager rather than a lack of a CU or SIMD with sufficient + resources. + Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient + resources. ' + Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel + where a workgroup could not be scheduled to a CU due to occupancy limitations + (like a lack of a CU or SIMD with sufficient resources). + Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where + a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch) + memory slots. While this can reach up to 100%, note that the actual occupancy + limitations on a kernel using private memory are typically quite small (for + example, less than 1% of the total number of waves that can be scheduled to + an accelerator). + Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available waveslots. 
+ Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available SGPRs. + Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup + could not be scheduled to a CU due to lack of available LDS. + Insufficient CU Barriers: The percent of total CU cycles in the kernel where a + workgroup could not be scheduled to a CU due to lack of available barriers. + Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where + a workgroup could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). + Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where + a wavefront could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators).
+ data source: + - metric_table: + id: 601 + title: Workgroup manager utilizations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * 
SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + - metric_table: + id: 602 + title: Workgroup Manager - Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else 
None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN 
/ ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml deleted file mode 100644 index cc650e9bc0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml +++ /dev/null @@ -1,142 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 700 - title: Wavefront - data source: - - metric_table: - id: 701 - title: Wavefront Launch Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - tips: - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - tips: - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - tips: - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - tips: - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - tips: - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - 
tips: - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - tips: - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - tips: - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - tips: - - - metric_table: - id: 702 - title: Wavefront Runtime Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - tips: - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - tips: - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - tips: - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - tips: - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - 
unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml new file mode 100644 index 0000000000..5e332c0b8f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml @@ -0,0 +1,173 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 700 + title: Wavefront + metrics_description: + Grid Size: The total number of work-items (or, threads) launched as a part of + the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + Workgroup Size: The total number of work-items (or, threads) in each workgroup + (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent + to the total block size. + Total Wavefronts: "The total number of wavefronts launched as part of the kernel\ + \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\ + \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\ + \ should be equivalent to the ceiling of grid size divided by 64." + Saved Wavefronts: The total number of wavefronts saved at a context-save. + Restored Wavefronts: The total number of wavefronts restored from a context-save. + VGPRs: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + AGPRs: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs + requested by the compiler due to allocation granularity.' 
+ SGPRs: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Kernel Time: The total duration of the executed kernel. + Kernel Time (Cycles): The total duration of the executed kernel in cycles. + Instructions per wavefront: The average number of instructions (of all types) + executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per normalization unit. This is averaged over all wavefronts + in a kernel dispatch. + Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch + stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar + memory, etc.) per normalization unit. This is averaged over all wavefronts in a kernel dispatch. + Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was + unable to issue an instruction for any reason (e.g., execution pipe back-pressure, + arbitration loss, etc.) per normalization unit. This counter is incremented + at every cycle by all wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could be + actively executing while a wave is issue stalled.
The sum of this metric, Dependency + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: The average number of cycles a wavefront in the kernel dispatch + was actively executing instructions per normalization unit. This measurement + is made on a per-wavefront basis, and may include cycles that another wavefront + spent actively executing (on another execution unit, for example) or was stalled. + As such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter. The sum of this metric, Issue + Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles + metric. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max:
MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml deleted file mode 100644 index 045f217ad8..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml +++ /dev/null @@ -1,267 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1000 - title: Compute Units - Instruction Mix - data source: - - metric_table: - id: 1001 - title: Overall Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - tips: - VMEM: - avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) - min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) - max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) - unit: (instr + $normUnit) - tips: - LDS: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - tips: - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - tips: - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - tips: - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - tips: - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - tips: - - - 
metric_table: - id: 1002 - title: VALU Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - tips: - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - tips: - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F64-ADD: - avg: 
AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - tips: - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1003 - title: VMEM Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / 
$denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml new file mode 100644 index 0000000000..69748199b5 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml @@ -0,0 +1,304 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + metrics_description: + VALU: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the compute unit, and are used to execute a wide + range of instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, shifts, + conditional evaluation, etc. + VMEM: The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to generic, global, private + and texture memory. + LDS: The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's __shfl operations. + MFMA: The total number of matrix fused multiply-add instructions issued. + SALU: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + SMEM: The total number of scalar memory (SMEM) operations issued. These are typically + used for loading kernel arguments, base-pointers and loads from HIP's __constant__ + memory. + Branch: The total number of branch operations issued. These typically consist + of jump or branch operations and are used to implement control flow. 
+ INT32: The total number of instructions operating on 32-bit integer operands issued + to the VALU per normalization unit. + INT64: The total number of instructions operating on 64-bit integer operands issued + to the VALU per normalization unit. + F16-ADD: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-FMA: The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per normalization unit. + F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating + on 16-bit floating-point operands issued to the VALU per normalization unit. + F32-ADD: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-FMA: The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per normalization unit. + F32-Trans: The total number of transcendental instructions (such as sqrt) operating + on 32-bit floating-point operands issued to the VALU per normalization unit. + F64-ADD: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-FMA: The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per normalization unit. 
+ F64-Trans: The total number of transcendental instructions (such as sqrt) operating + on 64-bit floating-point operands issued to the VALU per normalization unit. + Conversion: "The total number of type conversion instructions (such as converting\ + \ data to or from F32\u2194F64) issued to the VALU per normalization unit." + Global/Generic Instr: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read: The total number of global & generic memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Write: The total number of global & generic memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Atomic: The total number of global & generic memory atomic (with + and without return) instructions executed on all compute units on the accelerator, + per normalization unit. + Spill/Stack Instr: The total number of spill/stack memory instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Read: The total number of spill/stack memory read instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Write: The total number of spill/stack memory write instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per normalization + unit. Typically unused, as these memory operations are generally used to implement + thread-local storage. + MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization + unit. + MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per + normalization unit.
This is supported in AMD Instinct MI300 series and later + only. + MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per + normalization unit. + MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued + per normalization unit. + MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per + normalization unit. + MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per + normalization unit. + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1002 + title: VALU Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + INT32: + avg: 
AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) 
+ max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1003 + title: VMEM Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: 
AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1004 + title: MFMA Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml deleted file mode 100644 index c54a6703e8..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml +++ /dev/null @@ -1,260 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. 
-Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - data source: - - metric_table: - id: 1101 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - 
tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: - - - metric_table: - id: 1102 - title: Pipeline Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - tips: - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / 
SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - tips: - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - tips: - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / 
((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - MFMA Instr Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - unit: cycles/instr - tips: - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - tips: - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - tips: - - - metric_table: - id: 1103 - title: Arithmetic Operations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + 
(SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - unit: (OPs + $normUnit) - tips: - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - unit: (OPs + $normUnit) - tips: - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - 
SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - tips: - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - tips: - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - unit: (OPs + $normUnit) - tips: - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - tips: - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml new file mode 100644 index 0000000000..81c0197225 --- 
/dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml @@ -0,0 +1,316 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1100 + title: Compute Units - Compute Pipeline + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' 
+ MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. + IPC (Issued): The ratio of the total number of (non-internal) instructions issued + over the number of cycles where the scheduler was actively working on issuing + instructions. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. 
Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles spent by the MFMA was busy over the total CU cycles. + MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. + VMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a VMEM instruction to complete. + SMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a SMEM instruction to complete. + FLOPs (Total): The total number of floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + IOPs (Total): The total number of integer operations executed on either the VALU + or MFMA units, per normalization unit. + F16 OPs: The total number of 16-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + BF16 OPs: The total number of 16-bit brain floating-point operations executed + on either the VALU or MFMA units, per normalization unit. + F32 OPs: The total number of 32-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + F64 OPs: The total number of 64-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. 
+ INT8 OPs: The total number of 8-bit integer operations executed on either the + VALU or MFMA units, per normalization unit. + data source: + - metric_table: + id: 1101 + title: Compute Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + MFMA FLOPs (F16): + value: 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + - metric_table: + id: 1102 + title: Pipeline Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + 
SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * 
$GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + 
SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * 
SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_lds.yaml deleted file mode 100644 index 2c3fc34b2a..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_lds.yaml +++ /dev/null @@ -1,118 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. 
-Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1200 - title: Local Data Share (LDS) - data source: - - metric_table: - id: 1201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Theoretical Bandwidth: - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - unit: Pct of Peak - tips: - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1202 - title: LDS Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - LDS Instrs: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - tips: - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - unit: (Bytes + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: 
Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - tips: - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - tips: - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - tips: - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml new file mode 100644 index 0000000000..6cfe19d9de --- /dev/null +++ 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml @@ -0,0 +1,141 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1200 + title: Local Data Share (LDS) + metrics_description: + Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS + instructions, averaged over the lifetime of the kernel. Calculated as the ratio + of the total number of cycles spent by the scheduler issuing LDS instructions + over the total CU cycles. + Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been + loaded from, stored to, or atomically updated in the LDS per normalization unit. + Does not take into account the execution mask of the wavefront when the instruction + was executed. + Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent + servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing + bank conflicts over the number of LDS cycles that would have been required to + move the same amount of data in an uncontended access. + LDS Instructions: The total number of LDS instructions (including, but not limited + to, read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. 
+ Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. + Index Accesses: The total number of cycles spent in the LDS scheduler over all + operations per normalization unit. + Atomic Return Cycles: The total number of cycles spent on LDS atomics with return + per normalization unit. + Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Addr Conflict: The total number of cycles spent in the LDS scheduler due to address + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Unaligned Stall: The total number of cycles spent in the LDS scheduler due to + stalls from non-dword aligned addresses per normalization unit. + Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\ + \ normalization unit. This is unused and expected to be zero in most configurations\ + \ for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1201 + title: LDS Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth: + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1202 + title: LDS Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 
0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction-cache.yaml deleted file mode 100644 index 209a42726e..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction-cache.yaml +++ /dev/null @@ -1,105 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. 
-Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1300 - title: Instruction Cache - data source: - - metric_table: - id: 1301 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - tips: - L1I-L2 Bandwidth: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1302 - title: Instruction Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - tips: - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / 
((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - tips: - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - tips: - - metric_table: - id: 1303 - title: Instruction Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) - min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) - max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml new file mode 100644 index 0000000000..a53c23691f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml @@ -0,0 +1,106 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1300 + title: Instruction Cache + metrics_description: + Bandwidth: The number of bytes looked up in the L1I cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of L1I requests over the + total L1I cycles. + Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously + loaded line the cache. Calculated as the ratio of the number of L1I requests + that hit over the number of all L1I requests. + L1I-L2 Bandwidth: "The percent of the peak theoretical L1I \u2192 L2 cache request\ + \ bandwidth achieved. Calculated as the ratio of the total number of requests\ + \ from the L1I to the L2 cache over the total L1I-L2 interface cycles." 
+ Req: The total number of requests made to the L1I per normalization-unit + Hits: The total number of L1I requests that hit on a previously loaded cache line, + per normalization-unit. + Misses - Non Duplicated: The total number of L1I requests that missed on a cache + line that were not already pending due to another request, per normalization-unit. + Misses - Duplicated: The total number of L1I requests that missed on a cache line + that were already pending due to another request, per normalization-unit. + Instruction Fetch Latency: The average number of cycles spent to fetch instructions + to a CU. + data source: + - metric_table: + id: 1301 + title: L1I Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1302 + title: L1I cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / 
$denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + - metric_table: + id: 1303 + title: L1I <-> L2 interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml deleted file mode 100644 index 669a5834b9..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_constant-cache.yaml +++ /dev/null @@ -1,171 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1400 - title: Scalar L1 Data Cache - data source: - - metric_table: - id: 1401 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - tips: - sL1D-L2 BW: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) - / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1402 - title: Scalar L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - tips: - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / 
((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - tips: - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) 
- unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1403 - title: Scalar L1D Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml new file mode 100644 index 0000000000..d43157ce8e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml @@ -0,0 +1,186 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1400 + title: Scalar L1 Data Cache + metrics_description: + Bandwidth: The number of bytes looked up in the sL1D cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the + total sL1D cycles. + Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously + loaded line in the cache. The ratio of the number of sL1D requests that hit over + the number of all sL1D requests. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, per normalization unit. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." + Req: The total number of requests, of any size or type, made to the sL1D per normalization + unit. + Hits: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache + line that was not already pending due to another request, per normalization + unit. ' + Misses- Duplicated: The total number of sL1D requests that missed on a cache line + that was already pending due to another request, per normalization unit. + Read Req (Total): The total number of sL1D read requests of any size, per normalization + unit. + Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Read Req (1 DWord): The total number of sL1D read requests made for a single dword + of data (4B), per normalization unit. + Read Req (2 DWord): The total number of sL1D read requests made for two dwords + of data (8B), per normalization unit. + Read Req (4 DWord): The total number of sL1D read requests made for four dwords + of data (16B), per normalization unit.
+ Read Req (8 DWord): The total number of sL1D read requests made for eight dwords + of data (32B), per normalization unit. + Read Req (16 DWord): The total number of sL1D read requests made for sixteen + dwords of data (64B), per normalization unit. + Read Req: The total number of read requests from sL1D to the L2, per normalization + unit. + Write Req: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\ + \ per normalization unit." + data source: + - metric_table: + id: 1401 + title: Scalar L1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1402 + title: Scalar L1D cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) +
unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: 
MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml deleted file mode 100644 index 8994d0b17d..0000000000 --- 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_TA_and_TD.yaml +++ /dev/null @@ -1,174 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1500 - title: Address Processing Unit and Data Return Path (TA/TD) - data source: - - metric_table: - id: 1501 - title: Address Processing Unit - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data-Processor → Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / 
$denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Total Cycles: - avg: 
AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1502 - title: Data-Return Path - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Cache RAM → Data-Return Stall: - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Workgroup manager → Data-Return Stall: - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - 
TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - tips: - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml new file mode 100644 index 0000000000..4d808aecab --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml @@ -0,0 +1,248 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1500 + title: Address Processing Unit and Data Return Path (TA/TD) + metrics_description: + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy + Address Stall: Percent of the total CU cycles the address processor was stalled + from sending address requests further into the vL1D pipeline. + Data Stall: Percent of the total CU cycles the address processor was stalled from + sending write/atomic data further into the vL1D pipeline. 
+ "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address + processor was stalled waiting to send command data to the data processor. + Total Instructions: The total number of memory instructions executed by the address + processor over all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read Instructions: The total number of global & generic memory + read instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Write Instructions: The total number of global & generic memory + write instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Atomic Instructions: The total number of global & generic memory + atomic (with and without return) instructions executed on all compute units + on the accelerator, per normalization unit. + Spill/Stack Instructions: The total number of spill/stack memory instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Read Instructions: The total number of spill/stack memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Write Instructions: The total number of spill/stack memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic + (with and without return) instructions executed on all compute units on the + accelerator, per normalization unit. Typically unused as these memory operations + are typically used to implement thread-local storage. + Spill/Stack Total Cycles: The number of cycles the address processing unit spent + working on spill/stack instructions, per normalization unit.
+ Spill/Stack Coalesced Read: The number of cycles the address processing unit spent + working on coalesced spill/stack read instructions, per normalization unit. + Spill/Stack Coalesced Write: The number of cycles the address processing unit + spent working on coalesced spill/stack write instructions, per normalization + unit. + Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return + unit was stalled on data to be returned from the vL1D Cache RAM. + "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the + data-return unit was stalled by the workgroup manager due to initialization + of registers as a part of launching new workgroups. + Coalescable Instructions: The number of instructions submitted to the data-return + unit by the address processor that were found to be coalescable, per normalization + unit. + Read Instructions: The number of read instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack reads in the address processor. + Write Instructions: The number of store instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack stores in the address processor. + Atomic Instructions: The number of atomic instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack atomics in the address processor. 
+ data source: + - metric_table: + id: 1501 + title: Busy and stall metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: 
MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1502 + title: Instruction counts + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: 
(Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + - metric_table: + id: 1503 + title: Spill and stack metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) /
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_L1_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_L1_cache.yaml deleted file mode 100644 index 5c14bae452..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_L1_cache.yaml +++ /dev/null @@ -1,414 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1600 - title: Vector L1 Data Cache - data source: - - metric_table: - id: 1601 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: Pct of Peak - tips: - Bandwidth: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - unit: Pct of Peak - tips: - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - tips: - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1602 - title: L1D Cache Stalls (%) - header: - metric: Metric - expr: Expression - tips: Tips - metric: - Stalled on L2 Data: - expr: - (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on L2 Req: - expr: - (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Address: - expr: - None - tips: - Stalled on Data: - expr: - None - tips: - Stalled on Latency FIFO: - expr: - None - tips: - Stalled on Request FIFO: - expr: - None - tips: - Stalled on Read Return: - expr: - None - tips: - Tag RAM Stall (Read): - expr: - (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Write): - expr: - (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM 
Stall (Atomic): - expr: - (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - cli_style: simple_box - - - metric_table: - id: 1603 - title: L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - tips: - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 BW: - avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - unit: (Bytes + $normUnit) - tips: - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / 
$denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - L1 Access Latency: - avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - unit: Cycles - tips: - L1-L2 Read Latency: - avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) - min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) - max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)) - unit: Cycles - tips: - L1-L2 Write Latency: - avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else - None)) - min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else - None)) - max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else - 
None)) - unit: Cycles - tips: - - - metric_table: - id: 1604 - title: L1D - L2 Transactions - header: - metric: Metric - xfer: Xfer - coherency: Coherency - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Atomic: - 
xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1605 - title: L1D Addr Translation - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - tips: - Inflight Req: - avg: None # Missing perfmon - min: None # Missing perfmon - max: None # Missing perfmon - units: (Req + $normUnit) - tips: - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - tips: - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - tips: - Translation Misses: - avg: 
AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - - metric_table: - id: 1606 - title: L1D Addr Translation Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml new file mode 100644 index 0000000000..96e021e378 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml @@ -0,0 +1,442 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1600 + title: Vector L1 Data Cache + metrics_description: + Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + Bandwidth: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions, as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. The number of bytes is calculated as the number of cache + lines requested multiplied by the cache line size. This value does not consider + partial requests, so for instance, if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. + Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. 
+ The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Coalescing: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled + waiting for requested data to return from the L2 cache divided by the number + of cycles where the vL1D is active. + Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled + waiting to issue a request for data to the L2 cache divided by the number of + cycles where the vL1D is active. + Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled + due to Read requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled + due to Write requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled + due to Atomic requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Total Req: The total number of incoming requests from the address processing unit + after coalescing. + Read Req: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + Write Req: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. 
+ Atomic Req: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions per normalization unit. The number of bytes is calculated as the + number of cache lines requested multiplied by the cache line size. This value + does not consider partial requests, so for instance, if only a single value + is requested in a cache line, the data movement will still be counted as a full + cache line. + Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in + vL1D cache over the total number of cache line requests to the vL1D Cache RAM. + Cache Accesses: The total number of cache line lookups in the vL1D. + Cache Hits: The number of cache accesses minus the number of outgoing requests + to the L2 cache, that is, the number of cache line requests serviced by the + vL1D Cache RAM per normalization unit. + Invalidations: The number of times the vL1D was issued a write-back invalidate + command during the kernel's execution per normalization unit. This may be triggered + by, for instance, the buffer_wbinvl1 instruction. + L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result + of VMEM instructions, per normalization unit. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so for instance, if only a single + value is requested in a cache line, the data movement will still be counted + as a full cache line. + L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + L1-L2 Write: The number of write requests to a vL1D cache line that were sent + through the vL1D to the L2 cache, per normalization unit.
+ L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with + and without return. + L1 Access Latency: Calculated as the average number of cycles that a vL1D cache + line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache + took to issue and receive read requests from the L2 Cache. This number also + includes requests for atomics with return values. + L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D + cache took to issue and receive acknowledgement of a write request to the L2 + Cache. This number also includes requests for atomics without return values. + NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed over + TCP instances, per normalization unit. + NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit.
+ UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit. + CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit. + RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed + over TCP instances, per normalization unit. + Req: The number of translation requests made to the UTCL1 per normalization unit. + Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + Hits: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + Translation Misses: The total number of translation requests that missed in the + UTCL1 due to translation not being present in the cache, per normalization + unit. + Permission Misses: "The total number of translation requests that missed in the\ + \ UTCL1 due to a permission error, per normalization unit. This is unused and\ + \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
+ data source: + - metric_table: + id: 1601 + title: vL1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1602 + title: vL1D cache stall metrics + header: + metric: Metric + expr: Expression + metric: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1603 + title: vL1D cache access metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Req: + avg: 
AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: 
AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + L1 Access Latency: + avg: 
AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + unit: Cycles + L1-L2 Read Latency: + avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + unit: Cycles + L1-L2 Write Latency: + avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) + else None)) + min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) + else None)) + max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) + else None)) + unit: Cycles + - metric_table: + id: 1604 + title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: 
AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: 
MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1605 + title: L1 Unified Translation Cache (UTCL1) + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + - metric_table: + id: 1606 + title: L1D Addr Translation Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: {} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_L2_cache.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_L2_cache.yaml deleted file mode 100644 index 08a8a5d724..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_L2_cache.yaml +++ /dev/null @@ -1,388 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1700 - title: L2 Cache - data source: - - metric_table: - id: 1701 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - tips: - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - tips: - L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - tips: - - - metric_table: - id: 1702 - title: L2 - Fabric Transactions - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read BW: - avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / $denom)) - min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / $denom)) - max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - 
TCC_EA_RDREQ_32B_sum) - * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Read Traffic: - avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - unit: pct - tips: - Remote Read Traffic: - avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) - unit: pct - tips: - Write and Atomic BW: - avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / $denom)) - min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / $denom)) - max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Remote Write and Atomic Traffic: - avg: AVG((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Atomic Traffic: - avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - tips: - Read Latency: - avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != - 0) else None)) - min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != - 0) else None)) - max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != - 0) else None)) - unit: Cycles - tips: - Write and Atomic Latency: - avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != - 0) else None)) - min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != - 0) else None)) - max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != - 0) else None)) - unit: Cycles - tips: - Atomic Latency: - avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if 
(TCC_EA_ATOMIC_sum - != 0) else None)) - unit: Cycles - tips: - - - metric_table: - id: 1703 - title: L2 Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / $denom) - min: MIN((TCC_REQ_sum * 128) / $denom) - max: MAX((TCC_REQ_sum * 128) / $denom) - unit: (Bytes + $normUnit) - tips: - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - tips: - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / 
$denom)) - unit: (Misses + $normUnit) - tips: - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1704 - title: L2 Cache Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - - - metric_table: - id: 1705 - title: L2 - Fabric Interface Stalls - header: - metric: Metric - type: Type - transaction: Transaction - avg: Avg - 
min: Min - max: Max - unit: Unit - tips: Tips - style: - type: simple_multi_bar - metric: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - - - metric_table: - id: 1706 - title: L2 - Fabric Detailed Transaction Breakdown - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached): - avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Read: - avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (32B): - avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) - unit: (Req + 
$normUnit) - tips: - Write and Atomic (Uncached): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write and Atomic: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic: - avg: AVG((TCC_EA_ATOMIC_sum / $denom)) - min: MIN((TCC_EA_ATOMIC_sum / $denom)) - max: MAX((TCC_EA_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml new file mode 100644 index 0000000000..f3ecdc468c --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml @@ -0,0 +1,536 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1700 + title: L2 Cache + metrics_description: + Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator over the total L2 cycles. + Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. 
The number + of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric + interface per unit time. + L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity + Fabric interface by write and atomic operations per unit time. + HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + Read BW: The total number of bytes read by the L2 cache from Infinity Fabric per + normalization unit. + HBM Read Traffic: The percent of read requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Read bandwidth directed to the local HBM. + Remote Read Traffic: The percent of read requests generated by the L2 cache that + are routed to any memory location other than the accelerator's local high-bandwidth + memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the size of the request (meaning that 32B and 64B + requests are both counted as a single request), so this metric only approximates + the percent of the L2-Fabric Read bandwidth directed to a remote location. 
+ Uncached Read Traffic: The percent of read requests generated by the L2 cache + that are reading from an uncached memory allocation. Note, as described in the + request flow section, a single 64B read request is typically counted as two + uncached read requests. So, it is possible for the Uncached Read Traffic to + reach up to 200% of the total number of read requests. This breakdown does not + consider the size of the request (i.e., 32B and 64B requests are both counted + as a single request), so this metric only approximates the percent of the L2-Fabric + read bandwidth directed to an uncached memory location. + Write and Atomic BW: The total number of bytes written by the L2 over Infinity + Fabric by write and atomic operations per normalization unit. Note that on current + CDNA accelerators, such as the MI2XX, requests are only considered atomic by + Infinity Fabric if they are targeted at non-write-cacheable memory, for example, + fine-grained memory allocations or uncached memory allocations on the MI2XX. + HBM Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are routed to the accelerator's local high-bandwidth memory + (HBM). This breakdown does not consider the size of the request (meaning that + 32B and 64B requests are both counted as a single request), so this metric only + approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to the local HBM. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Remote Write and Atomic Traffic: The percent of write and atomic requests generated by the + L2 cache that are routed to any memory location other than the accelerator's + local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote + accelerator's HBM.
This breakdown does not consider the size of the request + (meaning that 32B and 64B requests are both counted as a single request), so + this metric only approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to a remote location. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Atomic Traffic: The percent of write and atomic requests generated by the L2 cache that are + atomic requests to any memory location. This breakdown does not consider the + size of the request (meaning that 32B and 64B requests are both counted as a + single request), so this metric only approximates the percent of the L2-Fabric + Write and Atomic bandwidth consumed by atomic requests. Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric if + they are targeted at fine-grained memory allocations or uncached memory allocations. + Uncached Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are targeting uncached memory allocations. This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + Read Latency: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Write and Atomic Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2.
+ Bandwidth: The number of bytes looked up in the L2 cache, per normalization unit. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for + example, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + Req: The total number of incoming requests to the L2 from all clients for all + request types, per normalization unit. + Read Req: The total number of read requests to the L2 from all clients. + Write Req: The total number of write requests to the L2 from all clients. + Atomic Req: The total number of atomic requests (with and without return) to the + L2 from all clients. + Streaming Req: The total number of incoming requests to the L2 that are marked + as streaming. The exact meaning of this may differ depending on the targeted + accelerator, however on an MI2XX this corresponds to non-temporal load or stores. + The L2 cache attempts to evict streaming requests before normal requests when + the L2 is at capacity. + Probe Req: The number of coherence probe requests made to the L2 cache from outside + the accelerator. On an MI2XX, probe requests may be generated by, for example, + writes to fine-grained device memory or by writes to coarse-grained device memory. + Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + Hits: The total number of requests to the L2 from all clients that hit in the + cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. + Misses: The total number of requests to the L2 from all clients that miss in the + cache. As noted in the Speed-of-Light section, these do not include hit-on-miss + requests. + Writeback: The total number of L2 cache lines written back to memory for any reason. 
+ Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system + or atomic built-ins), by the command processor's memory acquire/release fences, + or for other internal hardware reasons. + Writeback (Internal): The total number of L2 cache lines written back to memory + for internal hardware reasons, per normalization unit. + Writeback (vL1D Req): The total number of L2 cache lines written back to memory + due to requests initiated by the vL1D cache, per normalization unit. + Evict (Internal): The total number of L2 cache lines evicted from the cache due + to capacity limits, per normalization unit. + Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due + to invalidation requests initiated by the vL1D cache, per normalization unit. + NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per normalization unit. + UC Req: The total number of requests to the L2 that go to Uncached (UC) memory + allocations. + CC Req: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. + RW Req: The total number of requests to the L2 that go to Read-Write coherent + memory (RW) allocations. + Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled + on write or atomic requests to any memory location because too many write/atomic + requests were currently in flight, as a percent of the total active L2 cycles. + Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of + data from any memory location, per normalization unit. + Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of + data from any memory location, per normalization unit. + Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached + data from any memory location, per normalization unit. 64B requests for uncached + data are counted as two 32B uncached data requests.
+ HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or + 64B of data from any source other than the accelerator's local HBM, per normalization + unit. + Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to + write or atomically update 32B of data to any memory location, per normalization + unit. + Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric + to write or atomically update 32B or 64B of uncached data, per normalization + unit. + Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to + write or atomically update 64B of data in any memory location, per normalization + unit. + HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write + or atomically update 32B or 64B of data in the accelerator's local HBM, per + normalization unit. + Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to + write or atomically update 32B or 64B of data in any memory location other than + the accelerator's local HBM, per normalization unit. + Atomic: The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per normalization unit. See Request + flow for more detail. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at non-write-cacheable memory, such as fine-grained memory allocations or uncached + memory allocations on the MI2XX. + Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\ + \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\ + \ over the total active L2 cycles." 
+ Write Stall: The ratio of the total number of cycles the L2-Fabric interface was + stalled on a write or atomic request to any destination (local HBM, remote PCIe + connected accelerator or CPU, or remote Infinity Fabric connected + accelerator or CPU) over the total active L2 cycles. + Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to remote PCIe connected accelerators or CPUs as a percent of + the total active L2 cycles. + Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on read requests to remote Infinity Fabric connected accelerators or + CPUs as a percent of the total active L2 cycles. + Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to the accelerator's local HBM as a percent of the total active + L2 cycles. + Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to remote PCIe connected accelerators or CPUs as a + percent of the total active L2 cycles. + Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on write or atomic requests to remote Infinity Fabric connected accelerators + or CPUs as a percent of the total active L2 cycles. + Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to the accelerator's local HBM as a percent of the total + active L2 cycles.
+ data source: + - metric_table: + id: 1701 + title: L2 Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + - metric_table: + id: 1702 + title: L2-Fabric interface metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read BW: + avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - 
TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if 
(TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + unit: Cycles + - metric_table: + id: 1703 + title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + 
Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: 
MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1704 + title: L2 Cache Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} + - metric_table: + id: 1705 + title: L2 - Fabric Interface stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + style: + type: simple_multi_bar + metric: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + - metric_table: + id: 1706 + title: L2 - Fabric interface detailed metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + 
avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) 
+ min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_L2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_L2_cache_per_channel.yaml deleted file mode 100644 index a787f360cf..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_L2_cache_per_channel.yaml +++ /dev/null @@ -1,350 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1800 - title: L2 Cache (per Channel) - data source: - - metric_table: - id: 1801 - title: Aggregate Stats (All channels) - header: - metric: Metric - avg: Avg - std dev: Std Dev - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + 
TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - std dev: STD(((((((((((((((((((((((((((((((((((100 * 
TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + 
TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + 
(TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) 
+ (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 - * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) - + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) - + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + 
(TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - unit: pct - tips: - # FIXME: other arggr metrics!! - - - metric_table: - id: 1802 - title: L2 Cache Hit Rate (pct) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1803 - title: L2 Requests (per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1804 - title: L2 Requests (per normUnit) - header: - metric: Channel - read req: L2 Read - write req: L2 Write - atomic req: L2 Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1805 - title: L2-Fabric Requests (per normUnit) - header: - metric: Channel - read req: L2-Fabric Read - write req: L2-Fabric Write and Atomic - atomic req: L2-Fabric Atomic - metric: - 
"::_1": - read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1806 - # title: L2-EA Latency (Cycles) - # header: - # metric: Metric - # read lat: L2-EA Read - # write lat: L2-EA Write - # atomic lat: L2-EA Atomic - # metric: - # "::_1": - # read lat: - # AVG(((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] - # != 0) else None)) - # write lat: - # AVG(((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] - # != 0) else None)) - # atomic lat: - # AVG(((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if - # (TCC_EA_ATOMIC[::_1] != 0) else 0)) - # placeholder_range: - # "::_1": 32 - # cli_style: simple_multiple_bar - - - metric_table: - id: 1806 - title: L2-Fabric Read Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1807 - title: L2-Fabric Write and Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1808 - title: L2-Fabric Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if - (TCC_EA_ATOMIC[::_1] != 0) else 0) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1809 - title: L2-Fabric Read Stall (Cycles per normUnit) - header: - metric: Channel - ea read stall - pcie: L2-Fabric Read Stall (PCIe) - ea read 
stall - if: L2-Fabric Read Stall (Infinity Fabric™) - ea read stall - hbm: L2-Fabric Read Stall (HBM) - metric: - "::_1": - ea read stall - pcie: None # Missing perfmon - ea read stall - if: None # Missing perfmon - ea read stall - hbm: None # Missing perfmon - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1810 - title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) - header: - metric: Channel - ea write stall - pcie: L2-Fabric Write Stall (PCIe) - ea write stall - if: L2-Fabric Write Stall (Infinity Fabric™) - ea write stall - hbm: L2-Fabric Write Stall (HBM) - ea write stall - starve: L2-Fabric Write Starve - metric: - "::_1": - ea write stall - pcie: None # Missing perfmon - ea write stall - if: None # Missing perfmon - ea write stall - hbm: None # Missing perfmon - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1812 - title: L2-Fabric (128B read requests per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - # tips: Number of 128-byte read requests sent to EA - cli_style: simple_box - tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml new file mode 100644 index 0000000000..f097a14b55 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml @@ -0,0 +1,323 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + metrics_description: + L2 Cache Hit Rate: The percent of total number of requests to the L2 from all + clients that hit in the cache. As noted in the Speed-of-Light section, this + includes hit-on-miss requests. + data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 + * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * + TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + + (100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 + * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * + TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) 
+ + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 + * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 + * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 + * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * + TCC_HIT[21])) +
(100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + + (100 * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 + * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) +
(TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 + * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * + TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + + (100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 + * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * + TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) +
(TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 + * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 * TCC_HIT[17])) + (100 * + TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + + (100 * TCC_HIT[22])) + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + + (100 * TCC_HIT[25])) + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27]))
+ (100 + * TCC_HIT[28])) + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * + TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + 
(TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + unit: pct + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if 
(TCC_EA_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: L2-Fabric Read Stall (PCIe) + ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1812 + title: L2-Fabric (128B read 
requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml index d6c4ff393d..e94471d7dc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml @@ -1,10 +1,11 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 2100 title: PC Sampling + metrics_description: {} data source: - - pc_sampling_table: - id: 2101 - title: PC Sampling - source: ps_file - comparable: false # enable it later + - pc_sampling_table: + id: 2101 + title: PC Sampling + source: ps_file + comparable: false diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml index ccf1309850..55c6f6bb24 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml @@ -1,14 +1,14 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py Panel Config: - id: 000 + id: 0 title: Top Stats + metrics_description: {} data source: - - raw_csv_table: - id: 001 - title: Top Kernels - source: pmc_kernel_top.csv - - - raw_csv_table: - id: 002 - title: Dispatch List - source: pmc_dispatch_info.csv + - raw_csv_table: + id: 1 + title: Top Kernels + source: pmc_kernel_top.csv + - raw_csv_table: + id: 2 + title: Dispatch List + source: pmc_dispatch_info.csv diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml index b7ec29eaf9..8470ffbbe3 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml @@ -1,9 +1,10 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 100 title: System Info + metrics_description: {} data source: - - raw_csv_table: - id: 101 - source: sysinfo.csv - columnwise: True + - raw_csv_table: + id: 101 + source: sysinfo.csv + columnwise: true diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml deleted file mode 100644 index 68687f1c28..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml +++ /dev/null @@ -1,262 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 200 - title: System Speed-of-Light - data source: - - metric_table: - id: 201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - 
Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - 
unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - tips: - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) - tips: - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - tips: - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: - LDS Bank 
Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - tips: - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - tips: - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - tips: - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - tips: - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / 
(End_Timestamp - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * 
AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml new file mode 100644 index 0000000000..722866f6e0 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml @@ -0,0 +1,346 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 200 + title: System Speed-of-Light + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations executed + per second. This does not include any 8-bit floating point operations from VALU + instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second.
Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. 
Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.'
+ Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS + scheduler due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the Bank + Conflict Rate). + vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that + hit in vL1D cache over the total number of cache line requests to the vL1D cache + RAM. + vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit + in the L2 cache over the total number of incoming cache line requests to the + L2 cache. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line.
This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: The time-averaged number of cycles read requests spent + in Infinity Fabric before data was returned to the L2. + L2-Fabric Write Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded + line in the cache. Calculated as the ratio of the number of sL1D requests that + hit over the number of all sL1D requests. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line + in the cache. Calculated as the ratio of the number of L1I requests that hit + over the number of all L1I requests. + L1I BW: The number of bytes looked up in the L1I cache per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + L1I Fetch Latency: The average number of cycles spent to fetch instructions to + a CU.
+ data source: + - metric_table: + id: 201 + title: System Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: 
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * 
SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + 
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) 
/ $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_mem_chart.yaml deleted file mode 100644 index eae47b787f..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_mem_chart.yaml +++ /dev/null @@ -1,315 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 300 - title: Memory Chart - data source: - - metric_table: - id: 301 - title: Memory Chart - header: - metric: Metric - #alias: #alias - value: Value - tips: Tips - metric: - # ---------------------------------------- - # Instr Buff Block - - #TODO: double check wave_occupancy - Wavefront Occupancy: - #alias: wave_occ_ - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), 0) - coll_level: SQ_LEVEL_WAVES - tips: - Wave Life: - #alias: wave_life_ - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) - tips: - - # ---------------------------------------- - # Instr Dispatch Block - SALU: - #alias: salu_ - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - tips: - SMEM: - #alias: smem_ - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - tips: - VALU: - #alias: valu_ - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - tips: - MFMA: - #alias: mfma_ - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - tips: - VMEM: - #alias: vmem_ - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - tips: - LDS: - #alias: lds_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - GWS: - #alias: gws_ - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - tips: - BR: - #alias: br_ - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - tips: - - # ---------------------------------------- - # Exec Block - Active CUs: - #alias: active_cu_ - value: 
$numActiveCUs - tips: - Num CUs: - #alias: num_cu_ - value: $cu_per_gpu - tips: - VGPR: - #alias: vgpr_ - value: ROUND(AVG(Arch_VGPR), 0) - tips: - # Todo: add AGPRs - SGPR: - #alias: sgpr_ - value: ROUND(AVG(SGPR), 0) - tips: - LDS Allocation: - #alias: lds_alloc_ - value: ROUND(AVG(LDS_Per_Workgroup), 0) - tips: - Scratch Allocation: - #alias: scratch_alloc_ - value: ROUND(AVG(Scratch_Per_Workitem), 0) - tips: - Wavefronts: - #alias: wavefronts_ - value: ROUND(AVG(SPI_CSN_WAVE), 0) - tips: - Workgroups: - #alias: workgroups_ - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - tips: - - # ---------------------------------------- - # LDS Block - LDS Req: - #alias: lds_req_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - LDS Util: - #alias: lds_util_ - value: - ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))), - 0) - tips: - LDS Latency: - #alias: lds_lat - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - tips: - - # ---------------------------------------- - # Vector L1 Cache Block - VL1 Rd: - #alias: vl1_rd_ - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - tips: - VL1 Wr: - #alias: vl1_wr_ - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - tips: - VL1 Atomic: - #alias: vl1_atom_ - value: - ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - tips: - - VL1 Hit: - #alias: vl1_hit_ - value: - ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None )), 0) - tips: - VL1 Lat: - #alias: vl1_lat_ - value: - ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)), 0) - tips: - VL1 Coalesce: - #alias: vl1_coales_ - value: - ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 
100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - tips: - VL1 Stall: - #alias: vl1_stall_ - value: - ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - tips: - - VL1_L2 Rd: - #alias: vl1_l2_rd_ - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - tips: - VL1_L2 Wr: - #alias: vl1_l2_wr_ - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - tips: - VL1_L2 Atomic: - #alias: vl1_l2_atom_ - value: - ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - tips: - - # ---------------------------------------- - # Scalar L1D Cache Block - VL1D Rd: - #alias: sl1_rd_ - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - tips: - VL1D Hit: - #alias: sl1_hit_ - value: - ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - tips: - VL1D Lat: - #alias: sl1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - tips: - - VL1D_L2 Rd: - #alias: sl1_l2_rd_ - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - tips: - VL1D_L2 Wr: - #alias: sl1_l2_wr_ - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - tips: - VL1D_L2 Atomic: - #alias: sl1_l2_atom_ - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # Instr L1 Cache Block - IL1 Fetch: - #alias: il1_fetch_ - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - tips: - IL1 Hit: - #alias: il1_hit_ - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - tips: - IL1 Lat: - #alias: il1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != - 0) else None)) * 100), 0) - tips: # ??? 
coll_level: SQ_IFETCH_LEVEL - IL1_L2 Rd: - #alias: il1_l2_req_ - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # L2 Cache Block(inside) - L2 Rd: - #alias: l2_rd_ - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - tips: - L2 Wr: - #alias: l2_wr_ - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - tips: - L2 Atomic: - #alias: l2_atom_ - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - tips: - L2 Hit: - #alias: l2_hit_ - value: - ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)), 0) - tips: - L2 Rd Lat: - #alias: l2_rd_lat_ - value: - # ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - # if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), - # 0) - tips: - L2 Wr Lat: - #alias: l2_wr_lat_ - value: - # ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + - # TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - # != 0) else None)), 0) - tips: - - # ---------------------------------------- - # Fabric Block - Fabric_L2 Rd: - #alias: l2_fabric_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - tips: - Fabric_L2 Wr: - #alias: l2_fabric_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - tips: - Fabric_L2 Atomic: - #alias: l2_fabric_atom_ - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - tips: - - Fabric Rd Lat: - #alias: fabric_rd_lat_ - value: - ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - tips: - Fabric Wr Lat: - #alias: fabric_wr_lat_ - value: - ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - tips: - Fabric Atomic Lat: - #alias: fabric_atom_lat_ - value: - ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - tips: - - HBM Rd: 
- #alias: hbm_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - tips: - HBM Wr: - #alias: hbm_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - tips: - - comparable: false # for now - cli_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml new file mode 100644 index 0000000000..1a6587ce82 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml @@ -0,0 +1,263 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 300 + title: Memory Chart + metrics_description: + Wavefront Occupancy: Wavefronts per active CU. + Wave Life: Average number of cycles executing a wave. + SALU: Total number of SALU (Scalar ALU) instructions issued per normalization + unit. + SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization + unit. + VALU: The number of VALU (Vector ALU) instructions issued per normalization unit. + MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per + normalization unit. + VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit. + LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's __shfl instructions) executed per normalization unit. + GWS: Total number of GDS (global data sync) instructions issued per normalization + unit. + BR: Total number of BRANCH instructions issued per normalization unit. + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + Num CUs: Total number of compute units (CUs) on the accelerator.
+ VGPR: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + SGPR: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Wavefronts: The total number of wavefronts, summed over all workgroups, forming + this kernel launch. + Workgroups: The total number of workgroups forming this kernel launch. + LDS Req: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Util: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. 
+ VL1 Rd: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + VL1 Wr: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + VL1 Atomic: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + VL1 Coalesce: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting + to issue a request for data to the L2 cache divided by the number of cycles + where the vL1D is active. + VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 cache per normalization + unit. + VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + sL1D Rd: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization + unit.
+ sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + IL1 Fetch: The total number of requests made to the L1I per normalization unit. + IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over the + number of all L1I requests. + IL1 Lat: The average number of cycles spent to fetch instructions to a CU. + IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization unit. + L2 Rd: The total number of read requests to the L2 from all clients. + L2 Wr: The total number of write requests to the L2 from all clients. + L2 Atomic: The total number of atomic requests (with and without return) to the + L2 from all clients. + L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values. + Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. + Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit.
+ Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either + 32-byte or 64-byte) that are actually atomic requests summed over TCC instances + per normalization unit. + Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in + Infinity Fabric before a completion acknowledgement (atomic without return value) + or data (atomic with return value) was returned to the L2. + HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator''s local HBM, per normalization + unit. 
' + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + value: Value + metric: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else + 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if 
(TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if + ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: null + L2 Wr Lat: + value: 
null + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + comparable: false + cli_style: mem_chart + tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml new file mode 100644 index 0000000000..41c8bac547 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml @@ -0,0 +1,9 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 400 + title: Roofline + metrics_description: {} + data source: + - None: + id: 401 + title: Roofline diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline_info.yaml deleted file mode 100644 index 1474b85cf2..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline_info.yaml +++ /dev/null @@ -1,8 +0,0 @@ ---- -Panel Config: - id: 400 - title: Roofline - data source: - - None: - id: 401 - title: Roofline \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command-processor.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command-processor.yaml deleted file mode 100644 index 164b3552bf..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command-processor.yaml +++ /dev/null @@ -1,135 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 500 - title: Command Processor (CPC/CPF) - data source: - - metric_table: - id: 501 - title: Command Processor Fetcher - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - tips: - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - 
!= 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - unit: pct - tips: - - - metric_table: - id: 502 - title: Packet Processor - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - tips: - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: Pct - tips: - CPC-L2 
Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: pct - tips: - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml new file mode 100644 index 0000000000..c4d2cabf52 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml @@ -0,0 +1,145 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 500 + title: Command Processor (CPC/CPF) + metrics_description: + CPF Utilization: Percent of total cycles where the CPF was busy actively doing + any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was + stalled for any reason. + CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address + translation. + CPC Utilization: Percent of total cycles where the CPC was busy actively doing + any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason. + CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands + for processing. + CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching + workgroups to the workgroup manager. + CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where + the CPC-L2 interface was active doing any work. + CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address + translation. + CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address + translation interface where the CPC was busy doing address translation work. 
' + data source: + - metric_table: + id: 501 + title: Command processor fetcher (CPF) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if 
(CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + - metric_table: + id: 502 + title: Command processor packet processor (CPC) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + 
CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_shader-processor-input.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_shader-processor-input.yaml deleted file mode 100644 index c78c3645a0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_shader-processor-input.yaml +++ /dev/null @@ -1,167 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 600 - title: Workgroup Manager (SPI) - data source: - - metric_table: - id: 601 - title: Workgroup Manager Utilizations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - tips: - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - tips: - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - tips: - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - tips: - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - tips: - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - 
None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - - metric_table: - id: 602 - title: Workgroup Manager - Resource Allocation - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - 
unit: Pct - tips: - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - tips: - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Workgroup Limit: - avg: 
AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml new file mode 100644 index 0000000000..f6bf13d8b8 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml @@ -0,0 +1,201 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 600 + title: Workgroup Manager (SPI) + metrics_description: + Accelerator Utilization: The percent of cycles in the kernel where the accelerator + was actively doing any work. + Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the + kernel where the scheduler-pipes were actively doing any work. + Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup + manager was actively doing any work. + Shader Engine Utilization: The percent of total shader engine cycles in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. 
+ SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. + Dispatched Workgroups: The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups, + forming this kernel launch. + VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation. + Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the workgroup manager rather than a lack of a CU or SIMD with sufficient + resources. + Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient + resources. ' + Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel + where a workgroup could not be scheduled to a CU due to occupancy limitations + (like a lack of a CU or SIMD with sufficient resources). + Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where + a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch) + memory slots. While this can reach up to 100%, note that the actual occupancy + limitations on a kernel using private memory are typically quite small (for + example, less than 1% of the total number of waves that can be scheduled to + an accelerator). + Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available waveslots. 
+ Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available SGPRs. + Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup + could not be scheduled to a CU due to lack of available LDS. + Insufficient CU Barriers: The percent of total CU cycles in the kernel where a + workgroup could not be scheduled to a CU due to lack of available barriers. + Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where + a workgroup could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). + Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where + a wavefront could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). 
+ data source: + - metric_table: + id: 601 + title: Workgroup manager utilizations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * 
SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + - metric_table: + id: 602 + title: Workgroup Manager - Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else 
None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN 
/ ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront-launch.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront-launch.yaml deleted file mode 100644 index cc650e9bc0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront-launch.yaml +++ /dev/null @@ -1,142 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 700 - title: Wavefront - data source: - - metric_table: - id: 701 - title: Wavefront Launch Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - tips: - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - tips: - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - tips: - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - tips: - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - tips: - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - 
tips: - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - tips: - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - tips: - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - tips: - - - metric_table: - id: 702 - title: Wavefront Runtime Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - tips: - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - tips: - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - tips: - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - tips: - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - 
unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml new file mode 100644 index 0000000000..5e332c0b8f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml @@ -0,0 +1,173 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 700 + title: Wavefront + metrics_description: + Grid Size: The total number of work-items (or, threads) launched as a part of + the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + Workgroup Size: The total number of work-items (or, threads) in each workgroup + (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent + to the total block size. + Total Wavefronts: "The total number of wavefronts launched as part of the kernel\ + \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\ + \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\ + \ should be equivalent to the ceiling of grid size divided by 64." + Saved Wavefronts: The total number of wavefronts saved at a context-save. + Restored Wavefronts: The total number of wavefronts restored from a context-save. + VGPRs: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + AGPRs: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs + requested by the compiler due to allocation granularity.' 
+ SGPRs: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Kernel Time: The total duration of the executed kernel. + Kernel Time (Cycles): The total duration of the executed kernel in cycles. + Instructions per wavefront: The average number of instructions (of all types) + executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per normalization unit. This is averaged over all wavefronts + in a kernel dispatch. + Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch + stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar + memory, etc.) per normalization unit. This counter is incremented at every cycle by all wavefronts on a CU stalled at a memory operation. + Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was + unable to issue an instruction for any reason (e.g., execution pipe back-pressure, + arbitration loss, etc.) per normalization unit. This counter is incremented + at every cycle by all wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could be + actively executing while a wave is issue stalled.
The sum of this metric, Dependency + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: The average number of cycles a wavefront in the kernel dispatch + was actively executing instructions per normalization unit. This measurement + is made on a per-wavefront basis, and may include cycles that another wavefront + spent actively executing (on another execution unit, for example) or was stalled. + As such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter. The sum of this metric, Issue + Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles + metric. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max:
MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute-unit-instruction-mix.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute-unit-instruction-mix.yaml deleted file mode 100644 index 83ba5367a7..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute-unit-instruction-mix.yaml +++ /dev/null @@ -1,277 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1000 - title: Compute Units - Instruction Mix - data source: - - metric_table: - id: 1001 - title: Overall Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - tips: - VMEM: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - tips: - LDS: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - tips: - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - tips: - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - tips: - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - tips: - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - 
max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1002 - title: VALU Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - tips: - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - tips: - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: 
MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - tips: - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1003 - title: VMEM Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack 
Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: 
MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml new file mode 100644 index 0000000000..9c923d7bb7 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml @@ -0,0 +1,309 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + metrics_description: + VALU: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the compute unit, and are used to execute a wide + range of instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, shifts, + conditional evaluation, etc. + VMEM: The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to generic, global, private + and texture memory. + LDS: The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's __shfl operations. + MFMA: The total number of matrix fused multiply-add instructions issued. + SALU: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + SMEM: The total number of scalar memory (SMEM) operations issued. 
These are typically + used for loading kernel arguments, base-pointers and loads from HIP's __constant__ + memory. + Branch: The total number of branch operations issued. These typically consist + of jump or branch operations and are used to implement control flow. + INT32: The total number of instructions operating on 32-bit integer operands issued + to the VALU per normalization unit. + INT64: The total number of instructions operating on 64-bit integer operands issued + to the VALU per normalization unit. + F16-ADD: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-FMA: The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per normalization unit. + F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating + on 16-bit floating-point operands issued to the VALU per normalization unit. + F32-ADD: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-FMA: The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per normalization unit. + F32-Trans: The total number of transcendental instructions (such as sqrt) operating + on 32-bit floating-point operands issued to the VALU per normalization unit. + F64-ADD: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. 
+ F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-FMA: The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per normalization unit. + F64-Trans: The total number of transcendental instructions (such as sqrt) operating + on 64-bit floating-point operands issued to the VALU per normalization unit. + Conversion: "The total number of type conversion instructions (such as converting\ + \ data to or from F32\u2194F64) issued to the VALU per normalization unit." + Global/Generic Instr: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read: The total number of global & generic memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Write: The total number of global & generic memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Atomic: The total number of global & generic memory atomic (with + and without return) instructions executed on all compute units on the accelerator, + per normalization unit. + Spill/Stack Instr: The total number of spill/stack memory instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Read: The total number of spill/stack memory read instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Write: The total number of spill/stack memory write instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per normalization + unit. 
Typically unused as these memory operations are typically used to implement + thread-local storage. + MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization + unit. + MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per + normalization unit. This is supported in AMD Instinct MI300 series and later + only. + MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per + normalization unit. + MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued + per normalization unit. + MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per + normalization unit. + MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per + normalization unit. + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: 
MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1002 + title: VALU Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / 
$denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1003 + title: VMEM Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: 
AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1004 + title: MFMA Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml deleted file mode 100644 index 3821a9d879..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml +++ /dev/null @@ -1,273 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - data source: - - metric_table: - id: 1101 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) 
/ (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / 
((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - - - metric_table: - id: 1102 - title: Pipeline Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - tips: - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - tips: - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / 
$GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - tips: - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - MFMA Instr Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - unit: cycles/instr - tips: - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - tips: - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - tips: - - - metric_table: - id: 1103 - title: Arithmetic Operations - header: - metric: 
Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - unit: (OPs + $normUnit) - tips: - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 
512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - unit: (OPs + $normUnit) - tips: - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - tips: - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - tips: - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - tips: - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - unit: (OPs + $normUnit) - tips: - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + 
SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - tips: - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml new file mode 100644 index 0000000000..5285c6b279 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml @@ -0,0 +1,330 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1100 + title: Compute Units - Compute Pipeline + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' 
+ MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. + IPC (Issued): The ratio of the total number of (non-internal) instructions issued + over the number of cycles where the scheduler was actively working on issuing + instructions. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. 
Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA unit was busy over the total CU cycles. + MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. + VMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a VMEM instruction to complete.
+ SMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a SMEM instruction to complete. + FLOPs (Total): The total number of floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + IOPs (Total): The total number of integer operations executed on either the VALU + or MFMA units, per normalization unit. + F16 OPs: The total number of 16-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + BF16 OPs: The total number of 16-bit brain floating-point operations executed + on either the VALU or MFMA units, per normalization unit. + F32 OPs: The total number of 32-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + F64 OPs: The total number of 64-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + INT8 OPs: The total number of 8-bit integer operations executed on either the + VALU or MFMA units, per normalization unit. 
+ data source: + - metric_table: + id: 1101 + title: Compute Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + - metric_table: + id: 1102 + title: Pipeline Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + 
SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: 
MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + 
SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + 
(64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_lds.yaml deleted file mode 100644 index c687e7c471..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_lds.yaml +++ /dev/null @@ -1,118 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1200 - title: Local Data Share (LDS) - data source: - - metric_table: - id: 1201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Theoretical Bandwidth (% of Peak): - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - unit: Pct of Peak - tips: - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1202 - title: LDS Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - LDS Instrs: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - tips: - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - min: MIN(((((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - unit: (Bytes + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - tips: - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - tips: - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - tips: - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: - Mem Violations: - avg: 
AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml new file mode 100644 index 0000000000..c1a8525348 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml @@ -0,0 +1,141 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1200 + title: Local Data Share (LDS) + metrics_description: + Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS + instructions, averaged over the lifetime of the kernel. Calculated as the ratio + of the total number of cycles spent by the scheduler issuing LDS instructions + over the total CU cycles. + Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been + loaded from, stored to, or atomically updated in the LDS per normalization unit. + Does not take into account the execution mask of the wavefront when the instruction + was executed. + Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent + servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing + bank conflicts over the number of LDS cycles that would have been required to + move the same amount of data in an uncontended access. 
+ LDS Instructions: The total number of LDS instructions (including, but not limited + to, read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. + Index Accesses: The total number of cycles spent in the LDS scheduler over all + operations per normalization unit. + Atomic Return Cycles: The total number of cycles spent on LDS atomics with return + per normalization unit. + Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Addr Conflict: The total number of cycles spent in the LDS scheduler due to address + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Unaligned Stall: The total number of cycles spent in the LDS scheduler due to + stalls from non-dword aligned addresses per normalization unit. + Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\ + \ normalization unit. This is unused and expected to be zero in most configurations\ + \ for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1201 + title: LDS Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1202 + title: LDS Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction-cache.yaml deleted file mode 100644 index 209a42726e..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction-cache.yaml +++ /dev/null @@ -1,105 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. 
-Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1300 - title: Instruction Cache - data source: - - metric_table: - id: 1301 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - tips: - L1I-L2 Bandwidth: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1302 - title: Instruction Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - tips: - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / 
((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - tips: - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - tips: - - metric_table: - id: 1303 - title: Instruction Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) - min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) - max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml new file mode 100644 index 0000000000..a53c23691f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml @@ -0,0 +1,106 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1300 + title: Instruction Cache + metrics_description: + Bandwidth: The number of bytes looked up in the L1I cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of L1I requests over the + total L1I cycles. + Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously + loaded line in the cache. Calculated as the ratio of the number of L1I requests + that hit over the number of all L1I requests. + L1I-L2 Bandwidth: "The percent of the peak theoretical L1I \u2192 L2 cache request\ + \ bandwidth achieved. Calculated as the ratio of the total number of requests\ + \ from the L1I to the L2 cache over the total L1I-L2 interface cycles."
+ Req: The total number of requests made to the L1I per normalization-unit + Hits: The total number of L1I requests that hit on a previously loaded cache line, + per normalization-unit. + Misses - Non Duplicated: The total number of L1I requests that missed on a cache + line that were not already pending due to another request, per normalization-unit. + Misses - Duplicated: The total number of L1I requests that missed on a cache line + that were already pending due to another request, per normalization-unit. + Instruction Fetch Latency: The average number of cycles spent to fetch instructions + to a CU. + data source: + - metric_table: + id: 1301 + title: L1I Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1302 + title: L1I cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / 
$denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + - metric_table: + id: 1303 + title: L1I <-> L2 interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml deleted file mode 100644 index 669a5834b9..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_constant-cache.yaml +++ /dev/null @@ -1,171 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1400 - title: Scalar L1 Data Cache - data source: - - metric_table: - id: 1401 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - tips: - sL1D-L2 BW: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) - / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1402 - title: Scalar L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - tips: - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / 
((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - tips: - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) 
- unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1403 - title: Scalar L1D Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml new file mode 100644 index 0000000000..d43157ce8e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml @@ -0,0 +1,186 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1400 + title: Scalar L1 Data Cache + metrics_description: + Bandwidth: The number of bytes looked up in the sL1D cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the + total sL1D cycles. + Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously + loaded line in the cache. The ratio of the number of sL1D requests that hit over + the number of all sL1D requests. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, per normalization unit. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." + Req: The total number of requests, of any size or type, made to the sL1D per normalization + unit. + Hits: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache + line that was not already pending due to another request, per normalization + unit. ' + Misses- Duplicated: The total number of sL1D requests that missed on a cache line + that was already pending due to another request, per normalization unit. + Read Req (Total): The total number of sL1D read requests of any size, per normalization + unit. + Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Read Req (1 DWord): The total number of sL1D read requests made for a single dword + of data (4B), per normalization unit. + Read Req (2 DWord): The total number of sL1D read requests made for two dwords + of data (8B), per normalization unit. + Read Req (4 DWord): The total number of sL1D read requests made for four dwords + of data (16B), per normalization unit.
+ Read Req (8 DWord): The total number of sL1D read requests made for eight dwords + of data (32B), per normalization unit. + Read Req (16 DWord): The total number of sL1D read requests made for sixteen + dwords of data (64B), per normalization unit. + Read Req: The total number of read requests from sL1D to the L2 per normalization + unit. + Write Req: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\ + \ per normalization unit." + data source: + - metric_table: + id: 1401 + title: Scalar L1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1402 + title: Scalar L1D cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) +
unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: 
MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml deleted file mode 100644 index 8994d0b17d..0000000000 --- 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_TA_and_TD.yaml +++ /dev/null @@ -1,174 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1500 - title: Address Processing Unit and Data Return Path (TA/TD) - data source: - - metric_table: - id: 1501 - title: Address Processing Unit - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data-Processor → Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / 
$denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Total Cycles: - avg: 
AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1502 - title: Data-Return Path - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Cache RAM → Data-Return Stall: - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Workgroup manager → Data-Return Stall: - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - 
TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - tips: - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml new file mode 100644 index 0000000000..f920234926 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml @@ -0,0 +1,248 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1500 + title: Address Processing Unit and Data Return Path (TA/TD) + metrics_description: + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy + Address Stall: Percent of the total CU cycles the address processor was stalled + from sending address requests further into the vL1D pipeline. + Data Stall: Percent of the total CU cycles the address processor was stalled from + sending write/atomic data further into the vL1D pipeline. 
+ "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address + processor was stalled waiting to send command data to the data processor. + Total Instructions: The total number of memory instructions executed by the address + processor over all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read Instructions: The total number of global & generic memory + read instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Write Instructions: The total number of global & generic memory + write instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Atomic Instructions: The total number of global & generic memory + atomic (with and without return) instructions executed on all compute units + on the accelerator, per normalization unit. + Spill/Stack Instructions: The total number of spill/stack memory instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Read Instructions: The total number of spill/stack memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Write Instructions: The total number of spill/stack memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic + (with and without return) instructions executed on all compute units on the + accelerator, per normalization unit. Typically unused as these memory operations + are typically used to implement thread-local storage. + Spill/Stack Total Cycles: The number of cycles the address processing unit spent + working on spill/stack instructions, per normalization unit. 
+ Spill/Stack Coalesced Read: The number of cycles the address processing unit spent + working on coalesced spill/stack read instructions, per normalization unit. + Spill/Stack Coalesced Write: The number of cycles the address processing unit + spent working on coalesced spill/stack write instructions, per normalization + unit. + Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return + unit was stalled on data to be returned from the vL1D Cache RAM. + "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the + data-return unit was stalled by the workgroup manager due to initialization + of registers as a part of launching new workgroups. + Coalescable Instructions: The number of instructions submitted to the data-return + unit by the address processor that were found to be coalescable, per normalization + unit. + Read Instructions: The number of read instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack reads in the address processor. + Write Instructions: The number of store instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack stores in the address processor. + Atomic Instructions: The number of atomic instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack atomics in the address processor. 
+ data source: + - metric_table: + id: 1501 + title: Busy and stall metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: 
MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1502 + title: Instruction counts + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: 
(Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + - metric_table: + id: 1503 + title: Spill and stack metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / 
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_L1_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_L1_cache.yaml deleted file mode 100644 index 7fabcfdb47..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_L1_cache.yaml +++ /dev/null @@ -1,387 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1600 - title: Vector L1 Data Cache - data source: - - metric_table: - id: 1601 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: Pct of Peak - tips: - Bandwidth: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - tips: - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - tips: - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1602 - title: L1D Cache Stalls (%) - header: - metric: Metric - expr: Expression - tips: Tips - metric: - Stalled on L2 Data: - expr: - (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on L2 Req: - expr: - (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Address: - expr: - None - tips: - Stalled on Data: - expr: - None - tips: - Stalled on Latency FIFO: - expr: - None - tips: - Stalled on Request FIFO: - expr: - None - tips: - Stalled on Read Return: - expr: - None - tips: - Tag RAM Stall (Read): - expr: - (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Write): - expr: - (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM 
Stall (Atomic): - expr: - (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - cli_style: simple_box - - - metric_table: - id: 1603 - title: L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - unit: (Bytes + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - tips: - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - unit: (Bytes + $normUnit) - tips: - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: 
MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1604 - title: L1D - L2 Transactions - header: - metric: Metric - xfer: Xfer - coherency: Coherency - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Write: - xfer: Write - 
coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1605 - title: L1D Addr Translation - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - tips: - Inflight Req: - avg: None # Missing perfmon - min: None # Missing perfmon - max: None # Missing perfmon - units: (Req + $normUnit) - tips: - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / 
TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - tips: - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - tips: - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - - metric_table: - id: 1606 - title: L1D Addr Translation Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml new file mode 100644 index 0000000000..708bbafe14 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml @@ -0,0 +1,412 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1600 + title: Vector L1 Data Cache + metrics_description: + Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. 
+ Bandwidth: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions, as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. The number of bytes is calculated as the number of cache + lines requested multiplied by the cache line size. This value does not consider + partial requests, so for instance, if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. + Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. + The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Coalescing: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled + waiting for requested data to return from the L2 cache divided by the number + of cycles where the vL1D is active. + Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled + waiting to issue a request for data to the L2 cache divided by the number of + cycles where the vL1D is active. + Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled + due to Read requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled + due to Write requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. 
+ Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled + due to Atomic requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Total Req: The total number of incoming requests from the address processing unit + after coalescing. + Read Req: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + Write Req: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + Atomic Req: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions per normalization unit. The number of bytes is calculated as the + number of cache lines requested multiplied by the cache line size. This value + does not consider partial requests, so for instance, if only a single value + is requested in a cache line, the data movement will still be counted as a full + cache line. + Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in + vL1D cache over the total number of cache line requests to the vL1D Cache RAM. + Cache Accesses: The total number of cache line lookups in the vL1D. + Cache Hits: The number of cache accesses minus the number of outgoing requests + to the L2 cache, that is, the number of cache line requests serviced by the + vL1D Cache RAM per normalization unit. + Invalidations: The number of times the vL1D was issued a write-back invalidate + command during the kernel's execution per normalization unit. This may be triggered + by, for instance, the buffer_wbinvl1 instruction. + L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result + of VMEM instructions, per normalization unit. 
The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so for instance, if only a single + value is requested in a cache line, the data movement will still be counted + as a full cache line. + L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + L1-L2 Write: The number of write requests to a vL1D cache line that were sent + through the vL1D to the L2 cache, per normalization unit. + L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + L1 Access Latency: Calculated as the average number of cycles that a vL1D cache + line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache + took to issue and receive read requests from the L2 Cache. This number also + includes requests for atomics with return values. + L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D + cache took to issue and receive acknowledgement of a write request to the L2 + Cache. This number also includes requests for atomics without return values. + NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. 
+ NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + Req: The number of translation requests made to the UTCL1 per normalization unit. + Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + Hits: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + Translation Misses: The total number of translation requests that missed in the + UTCL1 due to translation not being present in the cache, per normalization + unit. + Permission Misses: "The total number of translation requests that missed in the\ + \ UTCL1 due to a permission error, per normalization unit. This is unused and\ + \ expected to be zero in most configurations for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1601 + title: vL1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1602 + title: vL1D cache stall metrics + header: + metric: Metric + expr: Expression + metric: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1603 + title: vL1D cache access metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Req: + avg: 
AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: 
AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1604 
+ title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: 
MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1605 + title: L1 Unified Translation Cache (UTCL1) + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: 
MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + - metric_table: + id: 1606 + title: L1D Addr Translation Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: {} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_L2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_L2_cache.yaml deleted file mode 100644 index 4476ce7b15..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_L2_cache.yaml +++ /dev/null @@ -1,391 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1700 - title: L2 Cache - data source: - - metric_table: - id: 1701 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - tips: - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - tips: - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - tips: - - - metric_table: - id: 1702 - title: L2 - Fabric Transactions - header: - metric: Metric - avg: Avg - min: Min - 
max: Max - unit: Unit - tips: Tips - metric: - Read BW: - avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Write and Atomic BW: - avg: - AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - min: - MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - max: - MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Write and Atomic 
Traffic: - avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - unit: Cycles - tips: - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - min: 
MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - unit: Cycles - tips: - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - tips: - - - metric_table: - id: 1703 - title: L2 Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / $denom) - min: MIN((TCC_REQ_sum * 128) / $denom) - max: MAX((TCC_REQ_sum * 128) / $denom) - unit: (Bytes + $normUnit) - tips: - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: 
MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - tips: - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - tips: - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: 
MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1704 - title: L2 Cache Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - - - metric_table: - id: 1705 - title: L2 - Fabric Interface Stalls - header: - metric: Metric - type: Type - transaction: Transaction - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - style: - type: simple_multi_bar - metric: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - - - metric_table: - id: 1706 - title: L2 - Fabric Detailed Transaction Breakdown - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: 
(Req + $normUnit) - tips: - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (32B): - avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write and Atomic: - avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml new file mode 100644 index 0000000000..c2b82a38ec --- /dev/null +++ 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml @@ -0,0 +1,536 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1700 + title: L2 Cache + metrics_description: + Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator over the total L2 cycles. + Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. The number + of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric + interface per unit time. + L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity + Fabric interface by write and atomic operations per unit time. + HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + Read BW: The total number of bytes read by the L2 cache from Infinity Fabric per + normalization unit. + HBM Read Traffic: The percent of read requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). 
This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Read bandwidth directed to the local HBM. + Remote Read Traffic: The percent of read requests generated by the L2 cache that + are routed to any memory location other than the accelerator's local high-bandwidth + memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the size of the request (meaning that 32B and 64B + requests are both counted as a single request), so this metric only approximates + the percent of the L2-Fabric Read bandwidth directed to a remote location. + Uncached Read Traffic: The percent of read requests generated by the L2 cache + that are reading from an uncached memory allocation. Note, as described in the + request flow section, a single 64B read request is typically counted as two + uncached read requests. So, it is possible for the Uncached Read Traffic to + reach up to 200% of the total number of read requests. This breakdown does not + consider the size of the request (i.e., 32B and 64B requests are both counted + as a single request), so this metric only approximates the percent of the L2-Fabric + read bandwidth directed to an uncached memory location. + Write and Atomic BW: The total number of bytes written by the L2 over Infinity + Fabric by write and atomic operations per normalization unit. Note that on current + CDNA accelerators, such as the MI2XX, requests are only considered atomic by + Infinity Fabric if they are targeted at non-write-cacheable memory, for example, + fine-grained memory allocations or uncached memory allocations on the MI2XX. + HBM Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are routed to the accelerator's local high-bandwidth memory + (HBM). 
This breakdown does not consider the size of the request (meaning that + 32B and 64B requests are both counted as a single request), so this metric only + approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to the local HBM. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Remote Write and Atomic Traffic: The percent of write and atomic requests generated by the + L2 cache that are routed to any memory location other than the accelerator's + local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote + accelerator's HBM. This breakdown does not consider the size of the request + (meaning that 32B and 64B requests are both counted as a single request), so + this metric only approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to a remote location. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Atomic Traffic: The percent of write requests generated by the L2 cache that are + atomic requests to any memory location. This breakdown does not consider the + size of the request (meaning that 32B and 64B requests are both counted as a + single request), so this metric only approximates the percent of the L2-Fabric + Write and Atomic bandwidth consumed by atomic requests. Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric if + they are targeted at fine-grained memory allocations or uncached memory allocations. + Uncached Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are targeting uncached memory allocations.
This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + Read Latency: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Write and Atomic Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2. + Bandwidth: The number of bytes looked up in the L2 cache, per normalization unit. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for + example, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + Req: The total number of incoming requests to the L2 from all clients for all + request types, per normalization unit. + Read Req: The total number of read requests to the L2 from all clients. + Write Req: The total number of write requests to the L2 from all clients. + Atomic Req: The total number of atomic requests (with and without return) to the + L2 from all clients. + Streaming Req: The total number of incoming requests to the L2 that are marked + as streaming. The exact meaning of this may differ depending on the targeted + accelerator, however on an MI2XX this corresponds to non-temporal loads or stores. + The L2 cache attempts to evict streaming requests before normal requests when + the L2 is at capacity. + Probe Req: The number of coherence probe requests made to the L2 cache from outside + the accelerator.
On an MI2XX, probe requests may be generated by, for example, + writes to fine-grained device memory or by writes to coarse-grained device memory. + Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + Hits: The total number of requests to the L2 from all clients that hit in the + cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. + Misses: The total number of requests to the L2 from all clients that miss in the + cache. As noted in the Speed-of-Light section, these do not include hit-on-miss + requests. + Writeback: The total number of L2 cache lines written back to memory for any reason. + Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system + or atomic built-ins), by the command processor's memory acquire/release fences, + or for other internal hardware reasons. + Writeback (Internal): The total number of L2 cache lines written back to memory + for internal hardware reasons, per normalization unit. + Writeback (vL1D Req): The total number of L2 cache lines written back to memory + due to requests initiated by the vL1D cache, per normalization unit. + Evict (Internal): The total number of L2 cache lines evicted from the cache due + to capacity limits, per normalization unit. + Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due + to invalidation requests initiated by the vL1D cache, per normalization unit. + NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per normalization unit. + UC Req: The total number of requests to the L2 that go to Uncached (UC) memory + allocations. + CC Req: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. + RW Req: The total number of requests to the L2 that go to Read-Write coherent + memory (RW) allocations.
+ Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled + on write or atomic requests to any memory location because too many write/atomic + requests were currently in flight, as a percent of the total active L2 cycles. + Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of + data from any memory location, per normalization unit. + Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of + data from any memory location, per normalization unit. + Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached + data from any memory location, per normalization unit. 64B requests for uncached + data are counted as two 32B uncached data requests. + HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or + 64B of data from any source other than the accelerator's local HBM, per normalization + unit. + Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to + write or atomically update 32B of data to any memory location, per normalization + unit. + Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric + to write or atomically update 32B or 64B of uncached data, per normalization + unit. + Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to + write or atomically update 64B of data in any memory location, per normalization + unit. + HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write + or atomically update 32B or 64B of data in the accelerator's local HBM, per + normalization unit. + Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to + write or atomically update 32B or 64B of data in any memory location other than + the accelerator's local HBM, per normalization unit. 
+ Atomic: The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per normalization unit. See Request + flow for more detail. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at non-write-cacheable memory, such as fine-grained memory allocations or uncached + memory allocations on the MI2XX. + Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\ + \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\ + \ over the total active L2 cycles." + Write Stall: The ratio of the total number of cycles the L2-Fabric interface was + stalled on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator or CPU) over the total active L2 cycles. + Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to remote PCIe connected accelerators or CPUs as a percent of + the total active L2 cycles. + Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on read requests to remote Infinity Fabric connected accelerators or + CPUs as a percent of the total active L2 cycles. + Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to the accelerator's local HBM as a percent of the total active + L2 cycles. + Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to remote PCIe connected accelerators or CPUs as a + percent of the total active L2 cycles. 
+ Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on write or atomic requests to remote Infinity Fabric connected accelerators + or CPUs as a percent of the total active L2 cycles. + Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to accelerator's local HBM as a percent of the total + active L2 cycles. + data source: + - metric_table: + id: 1701 + title: L2 Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + - metric_table: + id: 1702 + title: L2-Fabric interface metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read BW: + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + 
!= 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - 
TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + - metric_table: + id: 1703 + 
title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: 
(Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1704 + title: L2 Cache Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} + - metric_table: + id: 1705 + title: L2 - Fabric Interface stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + style: + type: simple_multi_bar + metric: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 
* (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + - metric_table: + id: 1706 + title: L2 - Fabric interface detailed metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / 
$denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_L2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_L2_cache_per_channel.yaml deleted file mode 100644 index 6b16e302cc..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_L2_cache_per_channel.yaml +++ /dev/null @@ -1,298 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1800 - title: L2 Cache (per Channel) - data source: - - metric_table: - id: 1801 - title: Aggregate Stats (All channels) - header: - metric: Metric - avg: Avg - std dev: Std Dev - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * 
TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + 
TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + 
(TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - unit: pct - tips: - # FIXME: other arggr metrics!! - - - metric_table: - id: 1802 - title: L2 Cache Hit Rate (pct) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1803 - title: L2 Requests (per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1804 - title: L2 Requests (per normUnit) - header: - metric: Channel - read req: L2 Read - write req: L2 Write - atomic req: L2 Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1805 - title: L2-Fabric Requests (per normUnit) - header: - metric: Channel - read req: L2-Fabric Read - write req: L2-Fabric Write and Atomic - atomic req: L2-Fabric Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1806 - # title: L2-Fabric Latency (Cycles) - # header: - # metric: Metric - # read lat: L2-Fabric Read - # write lat: L2-Fabric Write - # atomic lat: L2-Fabric Atomic - # metric: - # "::_1": - # read lat: - # AVG(((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - # != 0) else None)) - # write lat: - # 
AVG(((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - # != 0) else None)) - # atomic lat: - # AVG(((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - # (TCC_EA0_ATOMIC[::_1] != 0) else 0)) - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_multiple_bar - - - metric_table: - id: 1806 - title: L2-Fabric Read Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1807 - title: L2-Fabric Write and Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1808 - title: L2-Fabric Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - (TCC_EA0_ATOMIC[::_1] != 0) else 0) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1809 - title: L2-Fabric Read Stall (Cycles per normUnit) - header: - metric: Channel - ea read stall - pcie: L2-Fabric Read Stall (PCIe) - ea read stall - if: L2-Fabric Read Stall (Infinity Fabric™) - ea read stall - hbm: L2-Fabric Read Stall (HBM) - metric: - "::_1": - ea read stall - pcie: None # Missing perfmon - ea read stall - if: None # Missing perfmon - ea read stall - hbm: None # Missing perfmon - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1810 - title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) - header: - metric: Channel - ea write stall - pcie: L2-Fabric Write Stall (PCIe) - ea write stall - gmi: L2-Fabric Write Stall 
(Infinity Fabric™) - ea write stall - dram: L2-Fabric Write Stall (HBM) - ea write stall - starve: L2-Fabric Write Starve - metric: - "::_1": - ea write stall - pcie: None # Missing perfmon - ea write stall - if: None # Missing perfmon - ea write stall - hbm: None # Missing perfmon - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1811 - # title: L2 Tag Stall (cycles) - # header: - # metric: Metric - # expr: Expression - # metric: - # "::_1": - # expr: TCC_TAG_STALL[::_1] - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_box - - - metric_table: - id: 1812 - title: L2-Fabric (128B read requests per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - # tips: Number of 128-byte read requests sent to EA - cli_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml new file mode 100644 index 0000000000..849662871e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml @@ -0,0 +1,251 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + metrics_description: + L2 Cache Hit Rate: The percent of total number of requests to the L2 from all + clients that hit in the cache. As noted in the Speed-of-Light section, this + includes hit-on-miss requests. 
+ data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * 
TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) 
+ (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + 
TCC_HIT[15])) != 0) else None) + unit: pct + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency 
(Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: L2-Fabric Read Stall (PCIe) + ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1812 + title: L2-Fabric (128B read requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: 
simple_box + tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml index d6c4ff393d..e94471d7dc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml @@ -1,10 +1,11 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 2100 title: PC Sampling + metrics_description: {} data source: - - pc_sampling_table: - id: 2101 - title: PC Sampling - source: ps_file - comparable: false # enable it later + - pc_sampling_table: + id: 2101 + title: PC Sampling + source: ps_file + comparable: false diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml index ccf1309850..55c6f6bb24 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml @@ -1,14 +1,14 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py Panel Config: - id: 000 + id: 0 title: Top Stats + metrics_description: {} data source: - - raw_csv_table: - id: 001 - title: Top Kernels - source: pmc_kernel_top.csv - - - raw_csv_table: - id: 002 - title: Dispatch List - source: pmc_dispatch_info.csv + - raw_csv_table: + id: 1 + title: Top Kernels + source: pmc_kernel_top.csv + - raw_csv_table: + id: 2 + title: Dispatch List + source: pmc_dispatch_info.csv diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml index b7ec29eaf9..8470ffbbe3 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml @@ -1,9 +1,10 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 100 title: System Info + metrics_description: {} data source: - - raw_csv_table: - id: 101 - source: sysinfo.csv - columnwise: True + - raw_csv_table: + id: 101 + source: sysinfo.csv + columnwise: true diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml deleted file mode 100644 index 68687f1c28..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml +++ /dev/null @@ -1,262 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 200 - title: System Speed-of-Light - data source: - - metric_table: - id: 201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - 
Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - 
unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - tips: - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) - tips: - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - tips: - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: - LDS Bank 
Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - tips: - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - tips: - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - tips: - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - tips: - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / 
(End_Timestamp - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * 
AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml new file mode 100644 index 0000000000..e8aa26a3e1 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml @@ -0,0 +1,346 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 200 + title: System Speed-of-Light + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations + executed per second. This does not include any 8-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second.
Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. 
Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.'
+ Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS + scheduler due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the Bank + Conflict Rate). + vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that + hit in vL1D cache over the total number of cache line requests to the vL1D cache + RAM. + vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit + in the L2 cache over the total number of incoming cache line requests to the + L2 cache. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line.
This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: The time-averaged number of cycles read requests spent + in Infinity Fabric before data was returned to the L2. + L2-Fabric Write Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded + line in the cache. Calculated as the ratio of the number of sL1D requests that + hit over the number of all sL1D requests. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line + in the cache. Calculated as the ratio of the number of L1I requests that hit + over the number of all L1I requests. + L1I BW: The number of bytes looked up in the L1I cache per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + L1I Fetch Latency: The average number of cycles spent to fetch instructions to + a CU.
+ data source: + - metric_table: + id: 201 + title: System Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100
* AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * 
SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + 
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) 
/ $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_mem_chart.yaml deleted file mode 100644 index eae47b787f..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_mem_chart.yaml +++ /dev/null @@ -1,315 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 300 - title: Memory Chart - data source: - - metric_table: - id: 301 - title: Memory Chart - header: - metric: Metric - #alias: #alias - value: Value - tips: Tips - metric: - # ---------------------------------------- - # Instr Buff Block - - #TODO: double check wave_occupancy - Wavefront Occupancy: - #alias: wave_occ_ - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), 0) - coll_level: SQ_LEVEL_WAVES - tips: - Wave Life: - #alias: wave_life_ - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) - tips: - - # ---------------------------------------- - # Instr Dispatch Block - SALU: - #alias: salu_ - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - tips: - SMEM: - #alias: smem_ - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - tips: - VALU: - #alias: valu_ - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - tips: - MFMA: - #alias: mfma_ - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - tips: - VMEM: - #alias: vmem_ - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - tips: - LDS: - #alias: lds_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - GWS: - #alias: gws_ - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - tips: - BR: - #alias: br_ - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - tips: - - # ---------------------------------------- - # Exec Block - Active CUs: - #alias: active_cu_ - value: 
$numActiveCUs - tips: - Num CUs: - #alias: num_cu_ - value: $cu_per_gpu - tips: - VGPR: - #alias: vgpr_ - value: ROUND(AVG(Arch_VGPR), 0) - tips: - # Todo: add AGPRs - SGPR: - #alias: sgpr_ - value: ROUND(AVG(SGPR), 0) - tips: - LDS Allocation: - #alias: lds_alloc_ - value: ROUND(AVG(LDS_Per_Workgroup), 0) - tips: - Scratch Allocation: - #alias: scratch_alloc_ - value: ROUND(AVG(Scratch_Per_Workitem), 0) - tips: - Wavefronts: - #alias: wavefronts_ - value: ROUND(AVG(SPI_CSN_WAVE), 0) - tips: - Workgroups: - #alias: workgroups_ - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - tips: - - # ---------------------------------------- - # LDS Block - LDS Req: - #alias: lds_req_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - LDS Util: - #alias: lds_util_ - value: - ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))), - 0) - tips: - LDS Latency: - #alias: lds_lat - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - tips: - - # ---------------------------------------- - # Vector L1 Cache Block - VL1 Rd: - #alias: vl1_rd_ - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - tips: - VL1 Wr: - #alias: vl1_wr_ - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - tips: - VL1 Atomic: - #alias: vl1_atom_ - value: - ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - tips: - - VL1 Hit: - #alias: vl1_hit_ - value: - ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None )), 0) - tips: - VL1 Lat: - #alias: vl1_lat_ - value: - ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)), 0) - tips: - VL1 Coalesce: - #alias: vl1_coales_ - value: - ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 
100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - tips: - VL1 Stall: - #alias: vl1_stall_ - value: - ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - tips: - - VL1_L2 Rd: - #alias: vl1_l2_rd_ - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - tips: - VL1_L2 Wr: - #alias: vl1_l2_wr_ - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - tips: - VL1_L2 Atomic: - #alias: vl1_l2_atom_ - value: - ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - tips: - - # ---------------------------------------- - # Scalar L1D Cache Block - VL1D Rd: - #alias: sl1_rd_ - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - tips: - VL1D Hit: - #alias: sl1_hit_ - value: - ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - tips: - VL1D Lat: - #alias: sl1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - tips: - - VL1D_L2 Rd: - #alias: sl1_l2_rd_ - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - tips: - VL1D_L2 Wr: - #alias: sl1_l2_wr_ - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - tips: - VL1D_L2 Atomic: - #alias: sl1_l2_atom_ - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # Instr L1 Cache Block - IL1 Fetch: - #alias: il1_fetch_ - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - tips: - IL1 Hit: - #alias: il1_hit_ - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - tips: - IL1 Lat: - #alias: il1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != - 0) else None)) * 100), 0) - tips: # ??? 
coll_level: SQ_IFETCH_LEVEL - IL1_L2 Rd: - #alias: il1_l2_req_ - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # L2 Cache Block(inside) - L2 Rd: - #alias: l2_rd_ - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - tips: - L2 Wr: - #alias: l2_wr_ - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - tips: - L2 Atomic: - #alias: l2_atom_ - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - tips: - L2 Hit: - #alias: l2_hit_ - value: - ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)), 0) - tips: - L2 Rd Lat: - #alias: l2_rd_lat_ - value: - # ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - # if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), - # 0) - tips: - L2 Wr Lat: - #alias: l2_wr_lat_ - value: - # ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + - # TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - # != 0) else None)), 0) - tips: - - # ---------------------------------------- - # Fabric Block - Fabric_L2 Rd: - #alias: l2_fabric_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - tips: - Fabric_L2 Wr: - #alias: l2_fabric_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - tips: - Fabric_L2 Atomic: - #alias: l2_fabric_atom_ - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - tips: - - Fabric Rd Lat: - #alias: fabric_rd_lat_ - value: - ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - tips: - Fabric Wr Lat: - #alias: fabric_wr_lat_ - value: - ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - tips: - Fabric Atomic Lat: - #alias: fabric_atom_lat_ - value: - ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - tips: - - HBM Rd: 
- #alias: hbm_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - tips: - HBM Wr: - #alias: hbm_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - tips: - - comparable: false # for now - cli_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml new file mode 100644 index 0000000000..1a6587ce82 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml @@ -0,0 +1,263 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 300 + title: Memory Chart + metrics_description: + Wavefront Occupancy: Wavefronts per active CU. + Wave Life: Average number of cycles executing a wave. + SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization + unit. + SMEM: Total number of SMEM (Scalar Memory Read) instructions issued normalization + unit. + VALU: The number of VALU (Vector ALU) instructions issued per normalization unit. + MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per + normalization unit. + VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit. + LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's __shfl instructions) executed per normalization unit. + GWS: Total number of GDS (global data sync) instructions issued per normalization + unit. + BR: Total number of BRANCH instructions issued per normalization unit. + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + Num CUs: Total number of compute units (CUs) on the accelerator. 
+ VGPR: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + SGPR: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Wavefronts: The total number of wavefronts, summed over all workgroups, forming + this kernel launch. + Workgroups: The total number of workgroups forming this kernel launch. + LDS Req: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Util: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. 
+ VL1 Rd: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit + VL1 Wr: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit + VL1 Atomic: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit + VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + VL1 Coalesce: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting + to issue a request for data to the L2 cache divided by the number of cycles + where the vL1D is active. + VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the to the L2 Cache per normalization + unit. + VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + sL1D Rd: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization + unit. 
+ sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + IL1 Fetch: The total number of requests made to the L1I per normalization-unit. + IL1 Hit: The percent of L1I requests that hit on a previously loaded line the + cache. Calculated as the ratio of the number of L1I requests that hit over the + number of all L1I requests. + IL1 Lat: The average number of cycles spent to fetch instructions to a CU. + IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit. + L2 Rd: The total number of read requests to the L2 from all clients. + L2 Wr: The total number of write requests to the L2 from all clients. + L2 Atomic: The total number of atomic requests (with and without return) to the + L2 from all clients. + L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values. + Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. + Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. 
+ Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either + 32-byte or 64-byte) that are actually atomic requests summed over TCC instances + per normalization unit. + Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in + Infinity Fabric before a completion acknowledgement (atomic without return value) + or data (atomic with return value) was returned to the L2. + HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator''s local HBM, per normalization + unit. 
' + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + value: Value + metric: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else + 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if 
(TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if + ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: null + L2 Wr Lat: + value: 
null + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + comparable: false + cli_style: mem_chart + tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml new file mode 100644 index 0000000000..41c8bac547 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml @@ -0,0 +1,9 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 400 + title: Roofline + metrics_description: {} + data source: + - None: + id: 401 + title: Roofline diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline_info.yaml deleted file mode 100644 index 1474b85cf2..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline_info.yaml +++ /dev/null @@ -1,8 +0,0 @@ ---- -Panel Config: - id: 400 - title: Roofline - data source: - - None: - id: 401 - title: Roofline \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command-processor.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command-processor.yaml deleted file mode 100644 index 164b3552bf..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command-processor.yaml +++ /dev/null @@ -1,135 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 500 - title: Command Processor (CPC/CPF) - data source: - - metric_table: - id: 501 - title: Command Processor Fetcher - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - tips: - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - 
!= 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - unit: pct - tips: - - - metric_table: - id: 502 - title: Packet Processor - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - tips: - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: Pct - tips: - CPC-L2 
Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: pct - tips: - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml new file mode 100644 index 0000000000..c4d2cabf52 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml @@ -0,0 +1,145 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 500 + title: Command Processor (CPC/CPF) + metrics_description: + CPF Utilization: Percent of total cycles where the CPF was busy actively doing + any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was + stalled for any reason. + CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address + translation. + CPC Utilization: Percent of total cycles where the CPC was busy actively doing + any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason. + CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands + for processing. + CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching + workgroups to the workgroup manager. + CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where + the CPC-L2 interface was active doing any work. + CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address + translation. + CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address + translation interface where the CPC was busy doing address translation work.
' + data source: + - metric_table: + id: 501 + title: Command processor fetcher (CPF) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if 
(CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + - metric_table: + id: 502 + title: Command processor packet processor (CPC) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + 
CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml deleted file mode 100644 index c78c3645a0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_shader-processor-input.yaml +++ /dev/null @@ -1,167 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 600 - title: Workgroup Manager (SPI) - data source: - - metric_table: - id: 601 - title: Workgroup Manager Utilizations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - tips: - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - tips: - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - tips: - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - tips: - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - tips: - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - 
None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - - metric_table: - id: 602 - title: Workgroup Manager - Resource Allocation - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - 
unit: Pct - tips: - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - tips: - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Workgroup Limit: - avg: 
AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml new file mode 100644 index 0000000000..f6bf13d8b8 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml @@ -0,0 +1,201 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 600 + title: Workgroup Manager (SPI) + metrics_description: + Accelerator Utilization: The percent of cycles in the kernel where the accelerator + was actively doing any work. + Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the + kernel where the scheduler-pipes were actively doing any work. + Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup + manager was actively doing any work. + Shader Engine Utilization: The percent of total shader engine cycles in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. 
+ SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. + Dispatched Workgroups: The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups, + forming this kernel launch. + VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation. + Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the workgroup manager rather than a lack of a CU or SIMD with sufficient + resources. + Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient + resources. ' + Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel + where a workgroup could not be scheduled to a CU due to occupancy limitations + (like a lack of a CU or SIMD with sufficient resources). + Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where + a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch) + memory slots. While this can reach up to 100%, note that the actual occupancy + limitations on a kernel using private memory are typically quite small (for + example, less than 1% of the total number of waves that can be scheduled to + an accelerator). + Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available waveslots. 
+ Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available SGPRs. + Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup + could not be scheduled to a CU due to lack of available LDS. + Insufficient CU Barriers: The percent of total CU cycles in the kernel where a + workgroup could not be scheduled to a CU due to lack of available barriers. + Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where + a workgroup could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). + Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where + a wavefront could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). 
+ data source: + - metric_table: + id: 601 + title: Workgroup manager utilizations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * 
SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + - metric_table: + id: 602 + title: Workgroup Manager - Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else 
None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN 
/ ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml deleted file mode 100644 index cc650e9bc0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront-launch.yaml +++ /dev/null @@ -1,142 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 700 - title: Wavefront - data source: - - metric_table: - id: 701 - title: Wavefront Launch Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - tips: - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - tips: - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - tips: - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - tips: - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - tips: - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - 
tips: - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - tips: - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - tips: - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - tips: - - - metric_table: - id: 702 - title: Wavefront Runtime Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - tips: - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - tips: - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - tips: - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - tips: - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - 
unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml new file mode 100644 index 0000000000..5e332c0b8f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml @@ -0,0 +1,173 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 700 + title: Wavefront + metrics_description: + Grid Size: The total number of work-items (or, threads) launched as a part of + the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + Workgroup Size: The total number of work-items (or, threads) in each workgroup + (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent + to the total block size. + Total Wavefronts: "The total number of wavefronts launched as part of the kernel\ + \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\ + \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\ + \ should be equivalent to the ceiling of grid size divided by 64." + Saved Wavefronts: The total number of wavefronts saved at a context-save. + Restored Wavefronts: The total number of wavefronts restored from a context-save. + VGPRs: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + AGPRs: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs + requested by the compiler due to allocation granularity.' 
+ SGPRs: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Kernel Time: The total duration of the executed kernel. + Kernel Time (Cycles): The total duration of the executed kernel in cycles. + Instructions per wavefront: The average number of instructions (of all types) + executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per normalization unit. This is averaged over all wavefronts + in a kernel dispatch. + Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch + stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar + memory, etc.) per normalization unit. + Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was + unable to issue an instruction for any reason (e.g., execution pipe back-pressure, + arbitration loss, etc.) per normalization unit. This counter is incremented + at every cycle by all wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could be + actively executing while a wave is issue stalled.
The sum of this metric, Dependency + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: The average number of cycles a wavefront in the kernel dispatch + was actively executing instructions per normalization unit. This measurement + is made on a per-wavefront basis, and may include cycles that another wavefront + spent actively executing (on another execution unit, for example) or was stalled. + As such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter. The sum of this metric, Issue + Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles + metric. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max:
MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml deleted file mode 100644 index 83ba5367a7..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute-unit-instruction-mix.yaml +++ /dev/null @@ -1,277 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1000 - title: Compute Units - Instruction Mix - data source: - - metric_table: - id: 1001 - title: Overall Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - tips: - VMEM: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - tips: - LDS: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - tips: - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - tips: - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - tips: - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - tips: - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - 
max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1002 - title: VALU Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - tips: - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - tips: - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: 
MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - tips: - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1003 - title: VMEM Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack 
Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: 
MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml new file mode 100644 index 0000000000..9c923d7bb7 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml @@ -0,0 +1,309 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + metrics_description: + VALU: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the compute unit, and are used to execute a wide + range of instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, shifts, + conditional evaluation, etc. + VMEM: The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to generic, global, private + and texture memory. + LDS: The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's __shfl operations. + MFMA: The total number of matrix fused multiply-add instructions issued. + SALU: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + SMEM: The total number of scalar memory (SMEM) operations issued. 
These are typically + used for loading kernel arguments, base-pointers and loads from HIP's __constant__ + memory. + Branch: The total number of branch operations issued. These typically consist + of jump or branch operations and are used to implement control flow. + INT32: The total number of instructions operating on 32-bit integer operands issued + to the VALU per normalization unit. + INT64: The total number of instructions operating on 64-bit integer operands issued + to the VALU per normalization unit. + F16-ADD: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-FMA: The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per normalization unit. + F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating + on 16-bit floating-point operands issued to the VALU per normalization unit. + F32-ADD: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-FMA: The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per normalization unit. + F32-Trans: The total number of transcendental instructions (such as sqrt) operating + on 32-bit floating-point operands issued to the VALU per normalization unit. + F64-ADD: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. 
+ F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-FMA: The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per normalization unit. + F64-Trans: The total number of transcendental instructions (such as sqrt) operating + on 64-bit floating-point operands issued to the VALU per normalization unit. + Conversion: "The total number of type conversion instructions (such as converting\ + \ data to or from F32\u2194F64) issued to the VALU per normalization unit." + Global/Generic Instr: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read: The total number of global & generic memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Write: The total number of global & generic memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Atomic: The total number of global & generic memory atomic (with + and without return) instructions executed on all compute units on the accelerator, + per normalization unit. + Spill/Stack Instr: The total number of spill/stack memory instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Read: The total number of spill/stack memory read instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Write: The total number of spill/stack memory write instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per normalization + unit. 
Typically unused as these memory operations are typically used to implement + thread-local storage. + MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization + unit. + MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per + normalization unit. This is supported in AMD Instinct MI300 series and later + only. + MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per + normalization unit. + MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued + per normalization unit. + MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per + normalization unit. + MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per + normalization unit. + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: 
MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1002 + title: VALU Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / 
$denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1003 + title: VMEM Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: 
AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1004 + title: MFMA Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml deleted file mode 100644 index 3821a9d879..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml +++ /dev/null @@ -1,273 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - data source: - - metric_table: - id: 1101 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) 
/ (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / 
((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - - - metric_table: - id: 1102 - title: Pipeline Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - tips: - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - tips: - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / 
$GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - tips: - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - MFMA Instr Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - unit: cycles/instr - tips: - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - tips: - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - tips: - - - metric_table: - id: 1103 - title: Arithmetic Operations - header: - metric: 
Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - unit: (OPs + $normUnit) - tips: - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 
512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - unit: (OPs + $normUnit) - tips: - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - tips: - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - tips: - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - tips: - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - unit: (OPs + $normUnit) - tips: - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + 
SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - tips: - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml new file mode 100644 index 0000000000..5285c6b279 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml @@ -0,0 +1,330 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1100 + title: Compute Units - Compute Pipeline + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' 
+ MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. + IPC (Issued): The ratio of the total number of (non-internal) instructions issued + over the number of cycles where the scheduler was actively working on issuing + instructions. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. 
Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles spent by the MFMA was busy over the total CU cycles. + MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. + VMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a VMEM instruction to complete. 
+ SMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a SMEM instruction to complete. + FLOPs (Total): The total number of floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + IOPs (Total): The total number of integer operations executed on either the VALU + or MFMA units, per normalization unit. + F16 OPs: The total number of 16-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + BF16 OPs: The total number of 16-bit brain floating-point operations executed + on either the VALU or MFMA units, per normalization unit. + F32 OPs: The total number of 32-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + F64 OPs: The total number of 64-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + INT8 OPs: The total number of 8-bit integer operations executed on either the + VALU or MFMA units, per normalization unit. 
+ data source: + - metric_table: + id: 1101 + title: Compute Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + - metric_table: + id: 1102 + title: Pipeline Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + 
SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: 
MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + 
SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + 
(64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_lds.yaml deleted file mode 100644 index c687e7c471..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_lds.yaml +++ /dev/null @@ -1,118 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1200 - title: Local Data Share (LDS) - data source: - - metric_table: - id: 1201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Theoretical Bandwidth (% of Peak): - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - unit: Pct of Peak - tips: - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1202 - title: LDS Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - LDS Instrs: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - tips: - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - min: MIN(((((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - unit: (Bytes + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - tips: - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - tips: - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - tips: - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: - Mem Violations: - avg: 
AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml new file mode 100644 index 0000000000..c1a8525348 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml @@ -0,0 +1,141 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1200 + title: Local Data Share (LDS) + metrics_description: + Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS + instructions, averaged over the lifetime of the kernel. Calculated as the ratio + of the total number of cycles spent by the scheduler issuing LDS instructions + over the total CU cycles. + Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been + loaded from, stored to, or atomically updated in the LDS per normalization unit. + Does not take into account the execution mask of the wavefront when the instruction + was executed. + Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent + servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing + bank conflicts over the number of LDS cycles that would have been required to + move the same amount of data in an uncontended access. 
+ LDS Instructions: The total number of LDS instructions (including, but not limited + to, read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. + Index Accesses: The total number of cycles spent in the LDS scheduler over all + operations per normalization unit. + Atomic Return Cycles: The total number of cycles spent on LDS atomics with return + per normalization unit. + Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Addr Conflict: The total number of cycles spent in the LDS scheduler due to address + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Unaligned Stall: The total number of cycles spent in the LDS scheduler due to + stalls from non-dword aligned addresses per normalization unit. + Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\ + \ normalization unit. This is unused and expected to be zero in most configurations\ + \ for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1201 + title: LDS Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1202 + title: LDS Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction-cache.yaml deleted file mode 100644 index 209a42726e..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction-cache.yaml +++ /dev/null @@ -1,105 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. 
-Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1300 - title: Instruction Cache - data source: - - metric_table: - id: 1301 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - tips: - L1I-L2 Bandwidth: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1302 - title: Instruction Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - tips: - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / 
((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - tips: - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - tips: - - metric_table: - id: 1303 - title: Instruction Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) - min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) - max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml new file mode 100644 index 0000000000..a53c23691f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml @@ -0,0 +1,106 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1300 + title: Instruction Cache + metrics_description: + Bandwidth: The number of bytes looked up in the L1I cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of L1I requests over the + total L1I cycles. + Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously + loaded line the cache. Calculated as the ratio of the number of L1I requests + that hit over the number of all L1I requests. + L1I-L2 Bandwidth: "The percent of the peak theoretical L1I \u2192 L2 cache request\ + \ bandwidth achieved. Calculated as the ratio of the total number of requests\ + \ from the L1I to the L2 cache over the total L1I-L2 interface cycles." 
+ Req: The total number of requests made to the L1I per normalization-unit + Hits: The total number of L1I requests that hit on a previously loaded cache line, + per normalization-unit. + Misses - Non Duplicated: The total number of L1I requests that missed on a cache + line that were not already pending due to another request, per normalization-unit. + Misses - Duplicated: The total number of L1I requests that missed on a cache line + that were already pending due to another request, per normalization-unit. + Instruction Fetch Latency: The average number of cycles spent to fetch instructions + to a CU. + data source: + - metric_table: + id: 1301 + title: L1I Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1302 + title: L1I cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / 
$denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + - metric_table: + id: 1303 + title: L1I <-> L2 interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml deleted file mode 100644 index 669a5834b9..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_constant-cache.yaml +++ /dev/null @@ -1,171 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1400 - title: Scalar L1 Data Cache - data source: - - metric_table: - id: 1401 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - tips: - sL1D-L2 BW: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) - / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1402 - title: Scalar L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - tips: - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / 
((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - tips: - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) 
- unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1403 - title: Scalar L1D Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml new file mode 100644 index 0000000000..d43157ce8e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml @@ -0,0 +1,186 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1400 + title: Scalar L1 Data Cache + metrics_description: + Bandwidth: The number of bytes looked up in the sL1D cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the + total sL1D cycles. + Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously + loaded line the cache. The ratio of the number of sL1D requests that hit over + the number of all sL1D requests. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, per normalization unit. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." + Req: The total number of requests, of any size or type, made to the sL1D per normalization + unit. + Hits: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache + line that was not already pending due to another request, per normalization + unit. ' + Misses- Duplicated: The total number of sL1D requests that missed on a cache line + that was already pending due to another request, per normalization unit. + Read Req (Total): The total number of sL1D read requests of any size, per normalization + unit. + Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Read Req (1 DWord): The total number of sL1D read requests made for a single dword + of data (4B), per normalization unit. + Read Req (2 DWord): The total number of sL1D read requests made for a two dwords + of data (8B), per normalization unit. + Read Req (4 DWord): The total number of sL1D read requests made for a four dwords + of data (16B), per normalization unit. 
+ Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords + of data (32B), per normalization unit. + Read Req (16 DWord): The total number of sL1D read requests made for a sixteen + dwords of data (64B), per normalization unit. + Read Req: The total number of read requests from sL1D to the L2 per normalization + unit. + Write Req: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\ + \ per normalization unit." + data source: + - metric_table: + id: 1401 + title: Scalar L1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1402 + title: Scalar L1D cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + 
unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: 
MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml deleted file mode 100644 index 8994d0b17d..0000000000 --- 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_TA_and_TD.yaml +++ /dev/null @@ -1,174 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1500 - title: Address Processing Unit and Data Return Path (TA/TD) - data source: - - metric_table: - id: 1501 - title: Address Processing Unit - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data-Processor → Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / 
$denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Total Cycles: - avg: 
AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1502 - title: Data-Return Path - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Cache RAM → Data-Return Stall: - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Workgroup manager → Data-Return Stall: - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - 
TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - tips: - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml new file mode 100644 index 0000000000..f920234926 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml @@ -0,0 +1,248 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1500 + title: Address Processing Unit and Data Return Path (TA/TD) + metrics_description: + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy + Address Stall: Percent of the total CU cycles the address processor was stalled + from sending address requests further into the vL1D pipeline. + Data Stall: Percent of the total CU cycles the address processor was stalled from + sending write/atomic data further into the vL1D pipeline. 
+ "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address + processor was stalled waiting to send command data to the data processor. + Total Instructions: The total number of memory instructions executed by the address + processor over all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read Instructions: The total number of global & generic memory + read instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Write Instructions: The total number of global & generic memory + write instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Atomic Instructions: The total number of global & generic memory + atomic (with and without return) instructions executed on all compute units + on the accelerator, per normalization unit. + Spill/Stack Instructions: The total number of spill/stack memory instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Read Instructions: The total number of spill/stack memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Write Instructions: The total number of spill/stack memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic + (with and without return) instructions executed on all compute units on the + accelerator, per normalization unit. Typically unused as these memory operations + are typically used to implement thread-local storage. + Spill/Stack Total Cycles: The number of cycles the address processing unit spent + working on spill/stack instructions, per normalization unit.
+ Spill/Stack Coalesced Read: The number of cycles the address processing unit spent + working on coalesced spill/stack read instructions, per normalization unit. + Spill/Stack Coalesced Write: The number of cycles the address processing unit + spent working on coalesced spill/stack write instructions, per normalization + unit. + Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return + unit was stalled on data to be returned from the vL1D Cache RAM. + "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the + data-return unit was stalled by the workgroup manager due to initialization + of registers as a part of launching new workgroups. + Coalescable Instructions: The number of instructions submitted to the data-return + unit by the address processor that were found to be coalescable, per normalization + unit. + Read Instructions: The number of read instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack reads in the address processor. + Write Instructions: The number of store instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack stores in the address processor. + Atomic Instructions: The number of atomic instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack atomics in the address processor. 
+ data source: + - metric_table: + id: 1501 + title: Busy and stall metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: 
MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1502 + title: Instruction counts + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: 
(Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + - metric_table: + id: 1503 + title: Spill and stack metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / 
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_L1_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_L1_cache.yaml deleted file mode 100644 index 7fabcfdb47..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_L1_cache.yaml +++ /dev/null @@ -1,387 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1600 - title: Vector L1 Data Cache - data source: - - metric_table: - id: 1601 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: Pct of Peak - tips: - Bandwidth: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - tips: - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - tips: - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1602 - title: L1D Cache Stalls (%) - header: - metric: Metric - expr: Expression - tips: Tips - metric: - Stalled on L2 Data: - expr: - (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on L2 Req: - expr: - (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Address: - expr: - None - tips: - Stalled on Data: - expr: - None - tips: - Stalled on Latency FIFO: - expr: - None - tips: - Stalled on Request FIFO: - expr: - None - tips: - Stalled on Read Return: - expr: - None - tips: - Tag RAM Stall (Read): - expr: - (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Write): - expr: - (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM 
Stall (Atomic): - expr: - (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - cli_style: simple_box - - - metric_table: - id: 1603 - title: L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - unit: (Bytes + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - tips: - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - unit: (Bytes + $normUnit) - tips: - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: 
MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1604 - title: L1D - L2 Transactions - header: - metric: Metric - xfer: Xfer - coherency: Coherency - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Write: - xfer: Write - 
coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1605 - title: L1D Addr Translation - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - tips: - Inflight Req: - avg: None # Missing perfmon - min: None # Missing perfmon - max: None # Missing perfmon - units: (Req + $normUnit) - tips: - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / 
TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - tips: - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - tips: - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - - metric_table: - id: 1606 - title: L1D Addr Translation Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml new file mode 100644 index 0000000000..708bbafe14 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml @@ -0,0 +1,412 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1600 + title: Vector L1 Data Cache + metrics_description: + Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. 
+ Bandwidth: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions, as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. The number of bytes is calculated as the number of cache + lines requested multiplied by the cache line size. This value does not consider + partial requests, so for instance, if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. + Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. + The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Coalescing: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled + waiting for requested data to return from the L2 cache divided by the number + of cycles where the vL1D is active. + Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled + waiting to issue a request for data to the L2 cache divided by the number of + cycles where the vL1D is active. + Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled + due to Read requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled + due to Write requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. 
+ Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled + due to Atomic requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Total Req: The total number of incoming requests from the address processing unit + after coalescing. + Read Req: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + Write Req: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + Atomic Req: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions per normalization unit. The number of bytes is calculated as the + number of cache lines requested multiplied by the cache line size. This value + does not consider partial requests, so for instance, if only a single value + is requested in a cache line, the data movement will still be counted as a full + cache line. + Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in + vL1D cache over the total number of cache line requests to the vL1D Cache RAM. + Cache Accesses: The total number of cache line lookups in the vL1D. + Cache Hits: The number of cache accesses minus the number of outgoing requests + to the L2 cache, that is, the number of cache line requests serviced by the + vL1D Cache RAM per normalization unit. + Invalidations: The number of times the vL1D was issued a write-back invalidate + command during the kernel's execution per normalization unit. This may be triggered + by, for instance, the buffer_wbinvl1 instruction. + L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result + of VMEM instructions, per normalization unit. 
The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so for instance, if only a single + value is requested in a cache line, the data movement will still be counted + as a full cache line. + L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + L1-L2 Write: The number of write requests to a vL1D cache line that were sent + through the vL1D to the L2 cache, per normalization unit. + L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + L1 Access Latency: Calculated as the average number of cycles that a vL1D cache + line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache + took to issue and receive read requests from the L2 Cache. This number also + includes requests for atomics with return values. + L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D + cache took to issue and receive acknowledgement of a write request to the L2 + Cache. This number also includes requests for atomics without return values. + NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over
+ NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + Req: The number of translation requests made to the UTCL1 per normalization unit. + Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + Hits: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + Translation Misses: The total number of translation requests that missed in the + UTCL1 due to translation not being present in the cache, per normalization + unit. + Permission Misses: "The total number of translation requests that missed in the\ + \ UTCL1 due to a permission error, per normalization unit. This is unused and\ + \ expected to be zero in most configurations for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1601 + title: vL1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1602 + title: vL1D cache stall metrics + header: + metric: Metric + expr: Expression + metric: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1603 + title: vL1D cache access metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Req: + avg: 
AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: 
AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1604 
+ title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: 
MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1605 + title: L1 Unified Translation Cache (UTCL1) + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: 
MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + - metric_table: + id: 1606 + title: L1D Addr Translation Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: {} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_L2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_L2_cache.yaml deleted file mode 100644 index 5ac54a12a0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_L2_cache.yaml +++ /dev/null @@ -1,391 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1700 - title: L2 Cache - data source: - - metric_table: - id: 1701 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - tips: - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - tips: - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - tips: - - - metric_table: - id: 1702 - title: L2 - Fabric Transactions - header: - metric: Metric - avg: Avg - min: Min - 
max: Max - unit: Unit - tips: Tips - metric: - Read BW: - avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Write and Atomic BW: - avg: - AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - min: - MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - max: - MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Write and Atomic Traffic: 
- avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - unit: Cycles - tips: - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum 
/ TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - unit: Cycles - tips: - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - tips: - - - metric_table: - id: 1703 - title: L2 Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / $denom) - min: MIN((TCC_REQ_sum * 128) / $denom) - max: MAX((TCC_REQ_sum * 128) / $denom) - unit: (Bytes + $normUnit) - tips: - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / 
(TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - tips: - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - tips: - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + 
$normUnit) - tips: - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1704 - title: L2 Cache Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - - - metric_table: - id: 1705 - title: L2 - Fabric Interface Stalls - header: - metric: Metric - type: Type - transaction: Transaction - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - style: - type: simple_multi_bar - metric: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - - - metric_table: - id: 1706 - title: L2 - Fabric Detailed Transaction Breakdown - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Read: - 
avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (32B): - avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write and Atomic: - avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml new file mode 100644 index 0000000000..f1fd043df1 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml @@ -0,0 +1,536 @@ +# AUTOGENERATED 
FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1700 + title: L2 Cache + metrics_description: + Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator over the total L2 cycles. + Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. The number + of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric + interface per unit time. + L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity + Fabric interface by write and atomic operations per unit time. + HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + Read BW: The total number of bytes read by the L2 cache from Infinity Fabric per + normalization unit. + HBM Read Traffic: The percent of read requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Read bandwidth directed to the local HBM. 
+ Remote Read Traffic: The percent of read requests generated by the L2 cache that + are routed to any memory location other than the accelerator's local high-bandwidth + memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the size of the request (meaning that 32B and 64B + requests are both counted as a single request), so this metric only approximates + the percent of the L2-Fabric Read bandwidth directed to a remote location. + Uncached Read Traffic: The percent of read requests generated by the L2 cache + that are reading from an uncached memory allocation. Note, as described in the + request flow section, a single 64B read request is typically counted as two + uncached read requests. So, it is possible for the Uncached Read Traffic to + reach up to 200% of the total number of read requests. This breakdown does not + consider the size of the request (i.e., 32B and 64B requests are both counted + as a single request), so this metric only approximates the percent of the L2-Fabric + read bandwidth directed to an uncached memory location. + Write and Atomic BW: The total number of bytes written by the L2 over Infinity + Fabric by write and atomic operations per normalization unit. Note that on current + CDNA accelerators, such as the MI2XX, requests are only considered atomic by + Infinity Fabric if they are targeted at non-write-cacheable memory, for example, + fine-grained memory allocations or uncached memory allocations on the MI2XX. + HBM Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are routed to the accelerator's local high-bandwidth memory + (HBM). This breakdown does not consider the size of the request (meaning that + 32B and 64B requests are both counted as a single request), so this metric only + approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to the local HBM. 
Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Remote Write and Atomic Traffic: The percent of write and atomic requests generated by the + L2 cache that are routed to any memory location other than the accelerator's + local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote + accelerator's HBM. This breakdown does not consider the size of the request + (meaning that 32B and 64B requests are both counted as a single request), so + this metric only approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to a remote location. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Atomic Traffic: The percent of write requests generated by the L2 cache that are + atomic requests to any memory location. This breakdown does not consider the + size of the request (meaning that 32B and 64B requests are both counted as a + single request), so this metric only approximates the percent of the L2-Fabric + Write and Atomic bandwidth made up of atomic requests. Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric if + they are targeted at fine-grained memory allocations or uncached memory allocations. + Uncached Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are targeting uncached memory allocations. This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+ Read Latency: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Write and Atomic Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2. + Bandwidth: The number of bytes looked up in the L2 cache, per normalization unit. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for + example, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + Req: The total number of incoming requests to the L2 from all clients for all + request types, per normalization unit. + Read Req: The total number of read requests to the L2 from all clients. + Write Req: The total number of write requests to the L2 from all clients. + Atomic Req: The total number of atomic requests (with and without return) to the + L2 from all clients. + Streaming Req: The total number of incoming requests to the L2 that are marked + as streaming. The exact meaning of this may differ depending on the targeted + accelerator, however on an MI2XX this corresponds to non-temporal load or stores. + The L2 cache attempts to evict streaming requests before normal requests when + the L2 is at capacity. + Probe Req: The number of coherence probe requests made to the L2 cache from outside + the accelerator. On an MI2XX, probe requests may be generated by, for example, + writes to fine-grained device memory or by writes to coarse-grained device memory. 
+ Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + Hits: The total number of requests to the L2 from all clients that hit in the + cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. + Misses: The total number of requests to the L2 from all clients that miss in the + cache. As noted in the Speed-of-Light section, these do not include hit-on-miss + requests. + Writeback: The total number of L2 cache lines written back to memory for any reason. + Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system + or atomic built-ins), by the command processor's memory acquire/release fences, + or for other internal hardware reasons. + Writeback (Internal): The total number of L2 cache lines written back to memory + for internal hardware reasons, per normalization unit. + Writeback (vL1D Req): The total number of L2 cache lines written back to memory + due to requests initiated by the vL1D cache, per normalization unit. + Evict (Internal): The total number of L2 cache lines evicted from the cache due + to capacity limits, per normalization unit. + Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due + to invalidation requests initiated by the vL1D cache, per normalization unit. + NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per normalization unit. + UC Req: The total number of requests to the L2 that go to Uncached (UC) memory + allocations. + CC Req: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. + RW Req: The total number of requests to the L2 that go to Read-Write coherent + memory (RW) allocations.
+ Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled + on write or atomic requests to any memory location because too many write/atomic + requests were currently in flight, as a percent of the total active L2 cycles. + Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of + data from any memory location, per normalization unit. + Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of + data from any memory location, per normalization unit. + Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached + data from any memory location, per normalization unit. 64B requests for uncached + data are counted as two 32B uncached data requests. + HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or + 64B of data from any source other than the accelerator's local HBM, per normalization + unit. + Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to + write or atomically update 32B of data to any memory location, per normalization + unit. + Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric + to write or atomically update 32B or 64B of uncached data, per normalization + unit. + Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to + write or atomically update 64B of data in any memory location, per normalization + unit. + HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write + or atomically update 32B or 64B of data in the accelerator's local HBM, per + normalization unit. + Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to + write or atomically update 32B or 64B of data in any memory location other than + the accelerator's local HBM, per normalization unit. 
+ Atomic: The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per normalization unit. See Request + flow for more detail. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at non-write-cacheable memory, such as fine-grained memory allocations or uncached + memory allocations on the MI2XX. + Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\ + \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\ + \ over the total active L2 cycles." + Write Stall: The ratio of the total number of cycles the L2-Fabric interface was + stalled on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator or CPU) over the total active L2 cycles. + Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to remote PCIe connected accelerators or CPUs as a percent of + the total active L2 cycles. + Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on read requests to remote Infinity Fabric connected accelerators or + CPUs as a percent of the total active L2 cycles. + Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to the accelerator's local HBM as a percent of the total active + L2 cycles. + Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to remote PCIe connected accelerators or CPUs as a + percent of the total active L2 cycles. 
+ Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on write or atomic requests to remote Infinity Fabric connected accelerators + or CPUs as a percent of the total active L2 cycles. + Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to accelerator's local HBM as a percent of the total + active L2 cycles. + data source: + - metric_table: + id: 1701 + title: L2 Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + - metric_table: + id: 1702 + title: L2-Fabric interface metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read BW: + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + 
!= 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - 
TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + - metric_table: + id: 1703 + 
title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: 
(Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1704 + title: L2 Cache Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} + - metric_table: + id: 1705 + title: L2 - Fabric Interface stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + style: + type: simple_multi_bar + metric: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 
* (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + - metric_table: + id: 1706 + title: L2 - Fabric interface detailed metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / 
$denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml deleted file mode 100644 index 6b16e302cc..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_L2_cache_per_channel.yaml +++ /dev/null @@ -1,298 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1800 - title: L2 Cache (per Channel) - data source: - - metric_table: - id: 1801 - title: Aggregate Stats (All channels) - header: - metric: Metric - avg: Avg - std dev: Std Dev - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * 
TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + 
TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + 
(TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - unit: pct - tips: - # FIXME: other arggr metrics!! - - - metric_table: - id: 1802 - title: L2 Cache Hit Rate (pct) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1803 - title: L2 Requests (per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1804 - title: L2 Requests (per normUnit) - header: - metric: Channel - read req: L2 Read - write req: L2 Write - atomic req: L2 Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1805 - title: L2-Fabric Requests (per normUnit) - header: - metric: Channel - read req: L2-Fabric Read - write req: L2-Fabric Write and Atomic - atomic req: L2-Fabric Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1806 - # title: L2-Fabric Latency (Cycles) - # header: - # metric: Metric - # read lat: L2-Fabric Read - # write lat: L2-Fabric Write - # atomic lat: L2-Fabric Atomic - # metric: - # "::_1": - # read lat: - # AVG(((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - # != 0) else None)) - # write lat: - # 
AVG(((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - # != 0) else None)) - # atomic lat: - # AVG(((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - # (TCC_EA0_ATOMIC[::_1] != 0) else 0)) - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_multiple_bar - - - metric_table: - id: 1806 - title: L2-Fabric Read Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1807 - title: L2-Fabric Write and Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1808 - title: L2-Fabric Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - (TCC_EA0_ATOMIC[::_1] != 0) else 0) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1809 - title: L2-Fabric Read Stall (Cycles per normUnit) - header: - metric: Channel - ea read stall - pcie: L2-Fabric Read Stall (PCIe) - ea read stall - if: L2-Fabric Read Stall (Infinity Fabric™) - ea read stall - hbm: L2-Fabric Read Stall (HBM) - metric: - "::_1": - ea read stall - pcie: None # Missing perfmon - ea read stall - if: None # Missing perfmon - ea read stall - hbm: None # Missing perfmon - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1810 - title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) - header: - metric: Channel - ea write stall - pcie: L2-Fabric Write Stall (PCIe) - ea write stall - gmi: L2-Fabric Write Stall 
(Infinity Fabric™) - ea write stall - dram: L2-Fabric Write Stall (HBM) - ea write stall - starve: L2-Fabric Write Starve - metric: - "::_1": - ea write stall - pcie: None # Missing perfmon - ea write stall - if: None # Missing perfmon - ea write stall - hbm: None # Missing perfmon - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1811 - # title: L2 Tag Stall (cycles) - # header: - # metric: Metric - # expr: Expression - # metric: - # "::_1": - # expr: TCC_TAG_STALL[::_1] - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_box - - - metric_table: - id: 1812 - title: L2-Fabric (128B read requests per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - # tips: Number of 128-byte read requests sent to EA - cli_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml new file mode 100644 index 0000000000..849662871e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml @@ -0,0 +1,251 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + metrics_description: + L2 Cache Hit Rate: The percent of total number of requests to the L2 from all + clients that hit in the cache. As noted in the Speed-of-Light section, this + includes hit-on-miss requests. 
+ data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * 
TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) 
+ (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + 
TCC_HIT[15])) != 0) else None) + unit: pct + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency 
(Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: L2-Fabric Read Stall (PCIe) + ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1812 + title: L2-Fabric (128B read requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: 
simple_box + tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml index d6c4ff393d..e94471d7dc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml @@ -1,10 +1,11 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 2100 title: PC Sampling + metrics_description: {} data source: - - pc_sampling_table: - id: 2101 - title: PC Sampling - source: ps_file - comparable: false # enable it later + - pc_sampling_table: + id: 2101 + title: PC Sampling + source: ps_file + comparable: false diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml index ccf1309850..55c6f6bb24 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml @@ -1,14 +1,14 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py Panel Config: - id: 000 + id: 0 title: Top Stats + metrics_description: {} data source: - - raw_csv_table: - id: 001 - title: Top Kernels - source: pmc_kernel_top.csv - - - raw_csv_table: - id: 002 - title: Dispatch List - source: pmc_dispatch_info.csv + - raw_csv_table: + id: 1 + title: Top Kernels + source: pmc_kernel_top.csv + - raw_csv_table: + id: 2 + title: Dispatch List + source: pmc_dispatch_info.csv diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml index b7ec29eaf9..8470ffbbe3 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml @@ -1,9 +1,10 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 100 title: System Info + metrics_description: {} data source: - - raw_csv_table: - id: 101 - source: sysinfo.csv - columnwise: True + - raw_csv_table: + id: 101 + source: sysinfo.csv + columnwise: true diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml deleted file mode 100644 index 68687f1c28..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml +++ /dev/null @@ -1,262 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 200 - title: System Speed-of-Light - data source: - - metric_table: - id: 201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - 
Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - 
unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - tips: - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) - tips: - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - tips: - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: - LDS Bank 
Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - tips: - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - tips: - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - tips: - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - tips: - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / 
(End_Timestamp - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * 
AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml new file mode 100644 index 0000000000..722866f6e0 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml @@ -0,0 +1,346 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 200 + title: System Speed-of-Light + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations + executed per second. This does not include any 16-bit brain floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. 
Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. 
Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.'
+ Theoretical LDS Bandwidth: Indicates the maximum number of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS + scheduler due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the Bank + Conflict Rate). + vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that + hit in vL1D cache over the total number of cache line requests to the vL1D cache + RAM. + vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit + in the L2 cache over the total number of incoming cache line requests to the + L2 cache. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line.
This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: The time-averaged number of cycles read requests spent + in Infinity Fabric before data was returned to the L2. + L2-Fabric Write Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded + line in the cache. Calculated as the ratio of the number of sL1D requests that + hit over the number of all sL1D requests. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line + in the cache. Calculated as the ratio of the number of L1I requests that hit + over the number of all L1I requests. + L1I BW: The number of bytes looked up in the L1I cache per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + L1I Fetch Latency: The average number of cycles spent to fetch instructions to + a CU.
+ data source: + - metric_table: + id: 201 + title: System Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: 
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * 
SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + 
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) 
/ $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_mem_chart.yaml deleted file mode 100644 index 19ca46e63f..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_mem_chart.yaml +++ /dev/null @@ -1,316 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 300 - title: Memory Chart - data source: - - metric_table: - id: 301 - title: Memory Chart - header: - metric: Metric - #alias: #alias - value: Value - tips: Tips - metric: - # ---------------------------------------- - # Instr Buff Block - - #TODO: double check wave_occupancy - Wavefront Occupancy: - #alias: wave_occ_ - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), 0) - coll_level: SQ_LEVEL_WAVES - tips: - Wave Life: - #alias: wave_life_ - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) - tips: - - # ---------------------------------------- - # Instr Dispatch Block - SALU: - #alias: salu_ - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - tips: - SMEM: - #alias: smem_ - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - tips: - VALU: - #alias: valu_ - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - tips: - MFMA: - #alias: mfma_ - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - tips: - VMEM: - #alias: vmem_ - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - tips: - LDS: - #alias: lds_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - GWS: - #alias: gws_ - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - tips: - BR: - #alias: br_ - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - tips: - - # ---------------------------------------- - # Exec Block - Active CUs: - #alias: active_cu_ - value: 
$numActiveCUs - tips: - Num CUs: - #alias: num_cu_ - value: $cu_per_gpu - tips: - VGPR: - #alias: vgpr_ - value: ROUND(AVG(Arch_VGPR), 0) - tips: - # Todo: add AGPRs - SGPR: - #alias: sgpr_ - value: ROUND(AVG(SGPR), 0) - tips: - LDS Allocation: - #alias: lds_alloc_ - value: ROUND(AVG(LDS_Per_Workgroup), 0) - tips: - Scratch Allocation: - #alias: scratch_alloc_ - value: ROUND(AVG(Scratch_Per_Workitem), 0) - tips: - Wavefronts: - #alias: wavefronts_ - value: ROUND(AVG(SPI_CSN_WAVE), 0) - tips: - Workgroups: - #alias: workgroups_ - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - tips: - - # ---------------------------------------- - # LDS Block - LDS Req: - #alias: lds_req_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - LDS Util: - #alias: lds_util_ - value: - ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))), - 0) - tips: - LDS Latency: - #alias: lds_lat - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - tips: - - # ---------------------------------------- - # Vector L1 Cache Block - VL1 Rd: - #alias: vl1_rd_ - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - tips: - VL1 Wr: - #alias: vl1_wr_ - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - tips: - VL1 Atomic: - #alias: vl1_atom_ - value: - ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - tips: - - VL1 Hit: - #alias: vl1_hit_ - value: - ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None )), 0) - tips: - VL1 Lat: - #alias: vl1_lat_ - value: - ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)), 0) - tips: - VL1 Coalesce: - #alias: vl1_coales_ - value: - ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 
100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - tips: - VL1 Stall: - #alias: vl1_stall_ - value: - ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - tips: - - VL1_L2 Rd: - #alias: vl1_l2_rd_ - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - tips: - VL1_L2 Wr: - #alias: vl1_l2_wr_ - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - tips: - VL1_L2 Atomic: - #alias: vl1_l2_atom_ - value: - ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - tips: - - # ---------------------------------------- - # Scalar L1D Cache Block - VL1D Rd: - #alias: sl1_rd_ - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - tips: - VL1D Hit: - #alias: sl1_hit_ - value: - ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - tips: - VL1D Lat: - #alias: sl1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - tips: - - VL1D_L2 Rd: - #alias: sl1_l2_rd_ - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - tips: - VL1D_L2 Wr: - #alias: sl1_l2_wr_ - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - tips: - VL1D_L2 Atomic: - #alias: sl1_l2_atom_ - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # Instr L1 Cache Block - IL1 Fetch: - #alias: il1_fetch_ - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - tips: - IL1 Hit: - #alias: il1_hit_ - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - tips: - IL1 Lat: - #alias: il1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != - 0) else None)) * 100), 0) - tips: # ??? 
coll_level: SQ_IFETCH_LEVEL - IL1_L2 Rd: - #alias: il1_l2_req_ - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # L2 Cache Block(inside) - L2 Rd: - #alias: l2_rd_ - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - tips: - L2 Wr: - #alias: l2_wr_ - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - tips: - L2 Atomic: - #alias: l2_atom_ - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - tips: - L2 Hit: - #alias: l2_hit_ - value: - ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)), 0) - tips: - L2 Rd Lat: - #alias: l2_rd_lat_ - value: - # ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - # if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), - # 0) - tips: - L2 Wr Lat: - #alias: l2_wr_lat_ - value: - # ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + - # TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - # != 0) else None)), 0) - tips: - - # ---------------------------------------- - # Fabric Block - Fabric_L2 Rd: - #alias: l2_fabric_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - tips: - Fabric_L2 Wr: - #alias: l2_fabric_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - tips: - Fabric_L2 Atomic: - #alias: l2_fabric_atom_ - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - tips: - - Fabric Rd Lat: - #alias: fabric_rd_lat_ - value: - ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - tips: - Fabric Wr Lat: - #alias: fabric_wr_lat_ - value: - ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - tips: - Fabric Atomic Lat: - #alias: fabric_atom_lat_ - value: - ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - tips: - - HBM Rd: 
- #alias: hbm_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - tips: - HBM Wr: - #alias: hbm_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - tips: - - comparable: false # for now - cli_style: mem_chart - tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml new file mode 100644 index 0000000000..1a6587ce82 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml @@ -0,0 +1,263 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 300 + title: Memory Chart + metrics_description: + Wavefront Occupancy: Wavefronts per active CU. + Wave Life: Average number of cycles executing a wave. + SALU: Total number of SALU (Scalar ALU) instructions issued per normalization + unit. + SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization + unit. + VALU: The number of VALU (Vector ALU) instructions issued per normalization unit. + MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per + normalization unit. + VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit. + LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's __shfl instructions) executed per normalization unit. + GWS: Total number of GDS (global data sync) instructions issued per normalization + unit. + BR: Total number of BRANCH instructions issued per normalization unit. + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + Num CUs: Total number of compute units (CUs) on the accelerator.
+ VGPR: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + SGPR: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Wavefronts: The total number of wavefronts, summed over all workgroups, forming + this kernel launch. + Workgroups: The total number of workgroups forming this kernel launch. + LDS Req: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Util: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. 
+ VL1 Rd: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + VL1 Wr: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + VL1 Atomic: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + VL1 Coalesce: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting + to issue a request for data to the L2 cache divided by the number of cycles + where the vL1D is active. + VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + sL1D Rd: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization + unit.
+ sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + IL1 Fetch: The total number of requests made to the L1I per normalization unit. + IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over the + number of all L1I requests. + IL1 Lat: The average number of cycles spent to fetch instructions to a CU. + IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization unit. + L2 Rd: The total number of read requests to the L2 from all clients. + L2 Wr: The total number of write requests to the L2 from all clients. + L2 Atomic: The total number of atomic requests (with and without return) to the + L2 from all clients. + L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values. + Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. + Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit.
+ Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either + 32-byte or 64-byte) that are actually atomic requests summed over TCC instances + per normalization unit. + Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in + Infinity Fabric before a completion acknowledgement (atomic without return value) + or data (atomic with return value) was returned to the L2. + HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator''s local HBM, per normalization + unit. 
' + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + value: Value + metric: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else + 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if 
(TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if + ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: null + L2 Wr Lat: + value: 
null + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + comparable: false + cli_style: mem_chart + tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml new file mode 100644 index 0000000000..41c8bac547 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml @@ -0,0 +1,9 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 400 + title: Roofline + metrics_description: {} + data source: + - None: + id: 401 + title: Roofline diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline_info.yaml deleted file mode 100644 index 1474b85cf2..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline_info.yaml +++ /dev/null @@ -1,8 +0,0 @@ ---- -Panel Config: - id: 400 - title: Roofline - data source: - - None: - id: 401 - title: Roofline \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command-processor.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command-processor.yaml deleted file mode 100644 index 164b3552bf..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command-processor.yaml +++ /dev/null @@ -1,135 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 500 - title: Command Processor (CPC/CPF) - data source: - - metric_table: - id: 501 - title: Command Processor Fetcher - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - tips: - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - 
!= 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - unit: pct - tips: - - - metric_table: - id: 502 - title: Packet Processor - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - tips: - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: Pct - tips: - CPC-L2 
Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: pct - tips: - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml new file mode 100644 index 0000000000..c4d2cabf52 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml @@ -0,0 +1,145 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 500 + title: Command Processor (CPC/CPF) + metrics_description: + CPF Utilization: Percent of total cycles where the CPF was busy actively doing + any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was + stalled for any reason. + CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address + translation. + CPC Utilization: Percent of total cycles where the CPC was busy actively doing + any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason. + CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands + for processing. + CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching + workgroups to the workgroup manager. + CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where + the CPC-L2 interface was active doing any work. + CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address + translation. + CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address + translation interface where the CPC was busy doing address translation work. 
' + data source: + - metric_table: + id: 501 + title: Command processor fetcher (CPF) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if 
(CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + - metric_table: + id: 502 + title: Command processor packet processor (CPC) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + 
CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_shader-processor-input.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_shader-processor-input.yaml deleted file mode 100644 index c78c3645a0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_shader-processor-input.yaml +++ /dev/null @@ -1,167 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 600 - title: Workgroup Manager (SPI) - data source: - - metric_table: - id: 601 - title: Workgroup Manager Utilizations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - tips: - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - tips: - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - tips: - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - tips: - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - tips: - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - 
None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else - None)) - unit: Cycles/wave - tips: - - metric_table: - id: 602 - title: Workgroup Manager - Resource Allocation - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - 
unit: Pct - tips: - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - tips: - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Workgroup Limit: - avg: 
AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml new file mode 100644 index 0000000000..f6bf13d8b8 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml @@ -0,0 +1,201 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 600 + title: Workgroup Manager (SPI) + metrics_description: + Accelerator Utilization: The percent of cycles in the kernel where the accelerator + was actively doing any work. + Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the + kernel where the scheduler-pipes were actively doing any work. + Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup + manager was actively doing any work. + Shader Engine Utilization: The percent of total shader engine cycles in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. 
+ SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. + Dispatched Workgroups: The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups, + forming this kernel launch. + VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation. + Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the workgroup manager rather than a lack of a CU or SIMD with sufficient + resources. + Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient + resources. ' + Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel + where a workgroup could not be scheduled to a CU due to occupancy limitations + (like a lack of a CU or SIMD with sufficient resources). + Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where + a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch) + memory slots. While this can reach up to 100%, note that the actual occupancy + limitations on a kernel using private memory are typically quite small (for + example, less than 1% of the total number of waves that can be scheduled to + an accelerator). + Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available waveslots. 
+ Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available SGPRs. + Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup + could not be scheduled to a CU due to lack of available LDS. + Insufficient CU Barriers: The percent of total CU cycles in the kernel where a + workgroup could not be scheduled to a CU due to lack of available barriers. + Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where + a workgroup could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). + Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where + a wavefront could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). 
+ data source: + - metric_table: + id: 601 + title: Workgroup manager utilizations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((4 * 
SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else + None)) + unit: Cycles/wave + - metric_table: + id: 602 + title: Workgroup Manager - Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else 
None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN 
/ ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront-launch.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront-launch.yaml deleted file mode 100644 index cc650e9bc0..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront-launch.yaml +++ /dev/null @@ -1,142 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 700 - title: Wavefront - data source: - - metric_table: - id: 701 - title: Wavefront Launch Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - tips: - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - tips: - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - tips: - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - tips: - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - tips: - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - tips: - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - 
tips: - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - tips: - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - tips: - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - tips: - - - metric_table: - id: 702 - title: Wavefront Runtime Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - tips: - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - tips: - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - tips: - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - tips: - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - 
unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml new file mode 100644 index 0000000000..5e332c0b8f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml @@ -0,0 +1,173 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 700 + title: Wavefront + metrics_description: + Grid Size: The total number of work-items (or, threads) launched as a part of + the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + Workgroup Size: The total number of work-items (or, threads) in each workgroup + (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent + to the total block size. + Total Wavefronts: "The total number of wavefronts launched as part of the kernel\ + \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\ + \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\ + \ should be equivalent to the ceiling of grid size divided by 64." + Saved Wavefronts: The total number of wavefronts saved at a context-save. + Restored Wavefronts: The total number of wavefronts restored from a context-save. + VGPRs: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + AGPRs: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs + requested by the compiler due to allocation granularity.' 
+ SGPRs: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Kernel Time: The total duration of the executed kernel. + Kernel Time (Cycles): The total duration of the executed kernel in cycles. + Instructions per wavefront: The average number of instructions (of all types) + executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per normalization unit. This is averaged over all wavefronts + in a kernel dispatch. + Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch + stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar memory) + per normalization unit. This is averaged over all wavefronts in a kernel dispatch. + Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was + unable to issue an instruction for any reason (e.g., execution pipe back-pressure, + arbitration loss, etc.) per normalization unit. This counter is incremented + at every cycle by all wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could be + actively executing while a wave is issue stalled.
The sum of this metric, Dependency + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: The average number of cycles a wavefront in the kernel dispatch + was actively executing instructions per normalization unit. This measurement + is made on a per-wavefront basis, and may include cycles that another wavefront + spent actively executing (on another execution unit, for example) or was stalled. + As such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter. The sum of this metric, Issue + Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles + metric. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max:
MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute-unit-instruction-mix.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute-unit-instruction-mix.yaml deleted file mode 100644 index 83ba5367a7..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute-unit-instruction-mix.yaml +++ /dev/null @@ -1,277 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1000 - title: Compute Units - Instruction Mix - data source: - - metric_table: - id: 1001 - title: Overall Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - tips: - VMEM: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - tips: - LDS: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - tips: - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - tips: - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - tips: - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - tips: - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - 
max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1002 - title: VALU Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - tips: - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - tips: - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: 
MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - tips: - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1003 - title: VMEM Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack 
Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: 
MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml new file mode 100644 index 0000000000..9c923d7bb7 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml @@ -0,0 +1,309 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + metrics_description: + VALU: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the compute unit, and are used to execute a wide + range of instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, shifts, + conditional evaluation, etc. + VMEM: The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to generic, global, private + and texture memory. + LDS: The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's __shfl operations. + MFMA: The total number of matrix fused multiply-add instructions issued. + SALU: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + SMEM: The total number of scalar memory (SMEM) operations issued. 
These are typically + used for loading kernel arguments, base-pointers and loads from HIP's __constant__ + memory. + Branch: The total number of branch operations issued. These typically consist + of jump or branch operations and are used to implement control flow. + INT32: The total number of instructions operating on 32-bit integer operands issued + to the VALU per normalization unit. + INT64: The total number of instructions operating on 64-bit integer operands issued + to the VALU per normalization unit. + F16-ADD: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-FMA: The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per normalization unit. + F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating + on 16-bit floating-point operands issued to the VALU per normalization unit. + F32-ADD: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-FMA: The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per normalization unit. + F32-Trans: The total number of transcendental instructions (such as sqrt) operating + on 32-bit floating-point operands issued to the VALU per normalization unit. + F64-ADD: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. 
+ F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-FMA: The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per normalization unit. + F64-Trans: The total number of transcendental instructions (such as sqrt) operating + on 64-bit floating-point operands issued to the VALU per normalization unit. + Conversion: "The total number of type conversion instructions (such as converting\ + \ data to or from F32\u2194F64) issued to the VALU per normalization unit." + Global/Generic Instr: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read: The total number of global & generic memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Write: The total number of global & generic memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Atomic: The total number of global & generic memory atomic (with + and without return) instructions executed on all compute units on the accelerator, + per normalization unit. + Spill/Stack Instr: The total number of spill/stack memory instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Read: The total number of spill/stack memory read instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Write: The total number of spill/stack memory write instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per normalization + unit. 
Typically unused as these memory operations are typically used to implement + thread-local storage. + MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization + unit. + MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per + normalization unit. This is supported in AMD Instinct MI300 series and later + only. + MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per + normalization unit. + MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued + per normalization unit. + MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per + normalization unit. + MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per + normalization unit. + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: 
MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1002 + title: VALU Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / 
$denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1003 + title: VMEM Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: 
AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1004 + title: MFMA Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml deleted file mode 100644 index 95fbf70a9c..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml +++ /dev/null @@ -1,273 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - data source: - - metric_table: - id: 1101 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) 
/ (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / 
((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: All Peak FLOPS/clock/CU come from https://github.com/ROCm/amd_matrix_instruction_calculator/ - - - metric_table: - id: 1102 - title: Pipeline Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - tips: - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - tips: - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - 
unit: pct - tips: - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - tips: - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - MFMA Instr Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - unit: cycles/instr - tips: - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - tips: - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - tips: - 
- - metric_table: - id: 1103 - title: Arithmetic Operations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / - $denom)) - unit: (OPs + $normUnit) - tips: - IOPs (Total): - avg: AVG(((64 * 
(SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - unit: (OPs + $normUnit) - tips: - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - tips: - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - tips: - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - tips: - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - unit: (OPs + 
$normUnit) - tips: - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - tips: - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml new file mode 100644 index 0000000000..5285c6b279 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml @@ -0,0 +1,330 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1100 + title: Compute Units - Compute Pipeline + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. 
Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. + IPC (Issued): The ratio of the total number of (non-internal) instructions issued + over the number of cycles where the scheduler was actively working on issuing + instructions. 
+ SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA unit was busy over the total CU cycles. + MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions.
+ VMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a VMEM instruction to complete. + SMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a SMEM instruction to complete. + FLOPs (Total): The total number of floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + IOPs (Total): The total number of integer operations executed on either the VALU + or MFMA units, per normalization unit. + F16 OPs: The total number of 16-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + BF16 OPs: The total number of 16-bit brain floating-point operations executed + on either the VALU or MFMA units, per normalization unit. + F32 OPs: The total number of 32-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + F64 OPs: The total number of 64-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + INT8 OPs: The total number of 8-bit integer operations executed on either the + VALU or MFMA units, per normalization unit. 
+ data source: + - metric_table: + id: 1101 + title: Compute Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + - metric_table: + id: 1102 + title: Pipeline Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + 
SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: 
MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + 
SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + 
(64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_lds.yaml deleted file mode 100644 index 797178a0b3..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_lds.yaml +++ /dev/null @@ -1,119 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1200 - title: Local Data Share (LDS) - data source: - - metric_table: - id: 1201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Theoretical Bandwidth (% of Peak): - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - unit: Pct of Peak - tips: - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - tui_style: simple_bar - - - metric_table: - id: 1202 - title: LDS Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - LDS Instrs: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - tips: - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - min: 
MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - unit: (Bytes + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - tips: - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - tips: - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - tips: - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: - 
Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml new file mode 100644 index 0000000000..c1a8525348 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml @@ -0,0 +1,141 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1200 + title: Local Data Share (LDS) + metrics_description: + Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS + instructions, averaged over the lifetime of the kernel. Calculated as the ratio + of the total number of cycles spent by the scheduler issuing LDS instructions + over the total CU cycles. + Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been + loaded from, stored to, or atomically updated in the LDS per normalization unit. + Does not take into account the execution mask of the wavefront when the instruction + was executed. + Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent + servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing + bank conflicts over the number of LDS cycles that would have been required to + move the same amount of data in an uncontended access. 
+ LDS Instructions: The total number of LDS instructions (including, but not limited + to, read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. + Index Accesses: The total number of cycles spent in the LDS scheduler over all + operations per normalization unit. + Atomic Return Cycles: The total number of cycles spent on LDS atomics with return + per normalization unit. + Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Addr Conflict: The total number of cycles spent in the LDS scheduler due to address + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Unaligned Stall: The total number of cycles spent in the LDS scheduler due to + stalls from non-dword aligned addresses per normalization unit. + Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\ + \ normalization unit. This is unused and expected to be zero in most configurations\ + \ for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1201 + title: LDS Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1202 + title: LDS Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction-cache.yaml deleted file mode 100644 index 7db7c09337..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction-cache.yaml +++ /dev/null @@ -1,106 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. 
-Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1300 - title: Instruction Cache - data source: - - metric_table: - id: 1301 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - tips: - L1I-L2 Bandwidth: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - tui_style: simple_bar - - - metric_table: - id: 1302 - title: Instruction Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - tips: - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * 
SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - tips: - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - tips: - - metric_table: - id: 1303 - title: Instruction Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) - min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) - max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml new file mode 100644 index 0000000000..a53c23691f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml @@ -0,0 +1,106 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1300 + title: Instruction Cache + metrics_description: + Bandwidth: The number of bytes looked up in the L1I cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of L1I requests over the + total L1I cycles. + Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously + loaded line the cache. Calculated as the ratio of the number of L1I requests + that hit over the number of all L1I requests. + L1I-L2 Bandwidth: "The percent of the peak theoretical L1I \u2192 L2 cache request\ + \ bandwidth achieved. Calculated as the ratio of the total number of requests\ + \ from the L1I to the L2 cache over the total L1I-L2 interface cycles." 
+ Req: The total number of requests made to the L1I per normalization-unit + Hits: The total number of L1I requests that hit on a previously loaded cache line, + per normalization-unit. + Misses - Non Duplicated: The total number of L1I requests that missed on a cache + line that were not already pending due to another request, per normalization-unit. + Misses - Duplicated: The total number of L1I requests that missed on a cache line + that were already pending due to another request, per normalization-unit. + Instruction Fetch Latency: The average number of cycles spent to fetch instructions + to a CU. + data source: + - metric_table: + id: 1301 + title: L1I Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1302 + title: L1I cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / 
$denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + - metric_table: + id: 1303 + title: L1I <-> L2 interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml deleted file mode 100644 index 90befb4a03..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_constant-cache.yaml +++ /dev/null @@ -1,172 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1400 - title: Scalar L1 Data Cache - data source: - - metric_table: - id: 1401 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - tips: - sL1D-L2 BW: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) - / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - tui_style: simple_bar - - - metric_table: - id: 1402 - title: Scalar L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - tips: - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: 
MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - tips: - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: 
MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1403 - title: Scalar L1D Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml new file mode 100644 index 0000000000..d43157ce8e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml @@ -0,0 +1,186 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1400 + title: Scalar L1 Data Cache + metrics_description: + Bandwidth: The number of bytes looked up in the sL1D cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the + total sL1D cycles. + Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously + loaded line the cache. The ratio of the number of sL1D requests that hit over + the number of all sL1D requests. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, per normalization unit. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." + Req: The total number of requests, of any size or type, made to the sL1D per normalization + unit. + Hits: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache + line that was not already pending due to another request, per normalization + unit. ' + Misses- Duplicated: The total number of sL1D requests that missed on a cache line + that was already pending due to another request, per normalization unit. + Read Req (Total): The total number of sL1D read requests of any size, per normalization + unit. + Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Read Req (1 DWord): The total number of sL1D read requests made for a single dword + of data (4B), per normalization unit. + Read Req (2 DWord): The total number of sL1D read requests made for a two dwords + of data (8B), per normalization unit. + Read Req (4 DWord): The total number of sL1D read requests made for a four dwords + of data (16B), per normalization unit. 
+ Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords + of data (32B), per normalization unit. + Read Req (16 DWord): The total number of sL1D read requests made for a sixteen + dwords of data (64B), per normalization unit. + Read Req: The total number of read requests from sL1D to the L2 per normalization + unit. + Write Req: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\ + \ per normalization unit." + data source: + - metric_table: + id: 1401 + title: Scalar L1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1402 + title: Scalar L1D cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + 
unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: 
MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml deleted file mode 100644 index 8994d0b17d..0000000000 --- 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_TA_and_TD.yaml +++ /dev/null @@ -1,174 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1500 - title: Address Processing Unit and Data Return Path (TA/TD) - data source: - - metric_table: - id: 1501 - title: Address Processing Unit - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data-Processor → Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / 
$denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Total Cycles: - avg: 
AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1502 - title: Data-Return Path - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Cache RAM → Data-Return Stall: - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Workgroup manager → Data-Return Stall: - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - 
TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - tips: - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml new file mode 100644 index 0000000000..f920234926 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml @@ -0,0 +1,248 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1500 + title: Address Processing Unit and Data Return Path (TA/TD) + metrics_description: + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy + Address Stall: Percent of the total CU cycles the address processor was stalled + from sending address requests further into the vL1D pipeline. + Data Stall: Percent of the total CU cycles the address processor was stalled from + sending write/atomic data further into the vL1D pipeline. 
+ "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address + processor was stalled waiting to send command data to the data processor. + Total Instructions: The total number of memory instructions executed by the address + processor over all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read Instructions: The total number of global & generic memory + read instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Write Instructions: The total number of global & generic memory + write instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Atomic Instructions: The total number of global & generic memory + atomic (with and without return) instructions executed on all compute units + on the accelerator, per normalization unit. + Spill/Stack Instructions: The total number of spill/stack memory instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Read Instructions: The total number of spill/stack memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Write Instructions: The total number of spill/stack memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic + (with and without return) instructions executed on all compute units on the + accelerator, per normalization unit. Typically unused as these memory operations + are typically used to implement thread-local storage. + Spill/Stack Total Cycles: The number of cycles the address processing unit spent + working on spill/stack instructions, per normalization unit.
+ Spill/Stack Coalesced Read: The number of cycles the address processing unit spent + working on coalesced spill/stack read instructions, per normalization unit. + Spill/Stack Coalesced Write: The number of cycles the address processing unit + spent working on coalesced spill/stack write instructions, per normalization + unit. + Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return + unit was stalled on data to be returned from the vL1D Cache RAM. + "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the + data-return unit was stalled by the workgroup manager due to initialization + of registers as a part of launching new workgroups. + Coalescable Instructions: The number of instructions submitted to the data-return + unit by the address processor that were found to be coalescable, per normalization + unit. + Read Instructions: The number of read instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack reads in the address processor. + Write Instructions: The number of store instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack stores in the address processor. + Atomic Instructions: The number of atomic instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack atomics in the address processor. 
+ data source: + - metric_table: + id: 1501 + title: Busy and stall metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: 
MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1502 + title: Instruction counts + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: 
(Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + - metric_table: + id: 1503 + title: Spill and stack metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / 
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_L1_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_L1_cache.yaml deleted file mode 100644 index ef81f78e5c..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_L1_cache.yaml +++ /dev/null @@ -1,389 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1600 - title: Vector L1 Data Cache - data source: - - metric_table: - id: 1601 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: Pct of Peak - tips: - Bandwidth: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - tips: - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - tips: - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - tui_style: simple_bar - - - metric_table: - id: 1602 - title: L1D Cache Stalls (%) - header: - metric: Metric - expr: Expression - tips: Tips - metric: - Stalled on L2 Data: - expr: - (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on L2 Req: - expr: - (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Address: - expr: - None - tips: - Stalled on Data: - expr: - None - tips: - Stalled on Latency FIFO: - expr: - None - tips: - Stalled on Request FIFO: - expr: - None - tips: - Stalled on Read Return: - expr: - None - tips: - Tag RAM Stall (Read): - expr: - (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Write): - expr: - (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else 
None) - tips: - Tag RAM Stall (Atomic): - expr: - (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - cli_style: simple_box - tui_style: simple_box - - - metric_table: - id: 1603 - title: L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - unit: (Bytes + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * 
(((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - tips: - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - unit: (Bytes + $normUnit) - tips: - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Write: - avg: 
AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1604 - title: L1D - L2 Transactions - header: - metric: Metric - xfer: Xfer - coherency: Coherency - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - 
tips: - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1605 - title: L1D Addr Translation - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - tips: - Inflight Req: - avg: None # Missing perfmon - min: None # Missing perfmon - max: None # Missing perfmon - units: (Req + $normUnit) - tips: - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * 
TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - tips: - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - tips: - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - - metric_table: - id: 1606 - title: L1D Addr Translation Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml new file mode 100644 index 0000000000..708bbafe14 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml @@ -0,0 +1,412 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1600 + title: Vector L1 Data Cache + metrics_description: + Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. 
+ Bandwidth: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions, as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. The number of bytes is calculated as the number of cache + lines requested multiplied by the cache line size. This value does not consider + partial requests, so for instance, if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. + Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. + The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Coalescing: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled + waiting for requested data to return from the L2 cache divided by the number + of cycles where the vL1D is active. + Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled + waiting to issue a request for data to the L2 cache divided by the number of + cycles where the vL1D is active. + Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled + due to Read requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled + due to Write requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. 
+ Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled + due to Atomic requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Total Req: The total number of incoming requests from the address processing unit + after coalescing. + Read Req: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + Write Req: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + Atomic Req: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions per normalization unit. The number of bytes is calculated as the + number of cache lines requested multiplied by the cache line size. This value + does not consider partial requests, so for instance, if only a single value + is requested in a cache line, the data movement will still be counted as a full + cache line. + Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in + vL1D cache over the total number of cache line requests to the vL1D Cache RAM. + Cache Accesses: The total number of cache line lookups in the vL1D. + Cache Hits: The number of cache accesses minus the number of outgoing requests + to the L2 cache, that is, the number of cache line requests serviced by the + vL1D Cache RAM per normalization unit. + Invalidations: The number of times the vL1D was issued a write-back invalidate + command during the kernel's execution per normalization unit. This may be triggered + by, for instance, the buffer_wbinvl1 instruction. + L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result + of VMEM instructions, per normalization unit. 
The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so for instance, if only a single + value is requested in a cache line, the data movement will still be counted + as a full cache line. + L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + L1-L2 Write: The number of write requests to a vL1D cache line that were sent + through the vL1D to the L2 cache, per normalization unit. + L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + L1 Access Latency: Calculated as the average number of cycles that a vL1D cache + line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache + took to issue and receive read requests from the L2 Cache. This number also + includes requests for atomics with return values. + L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D + cache took to issue and receive acknowledgement of a write request to the L2 + Cache. This number also includes requests for atomics without return values. + NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit.
+ NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + Req: The number of translation requests made to the UTCL1 per normalization unit. + Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + Hits: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + Translation Misses: The total number of translation requests that missed in the + UTCL1 due to translation not being present in the cache, per normalization + unit. + Permission Misses: "The total number of translation requests that missed in the\ + \ UTCL1 due to a permission error, per normalization unit. This is unused and\ + \ expected to be zero in most configurations for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1601 + title: vL1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1602 + title: vL1D cache stall metrics + header: + metric: Metric + expr: Expression + metric: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1603 + title: vL1D cache access metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Req: + avg: 
AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: 
AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1604 
+ title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: 
MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1605 + title: L1 Unified Translation Cache (UTCL1) + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: 
MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + - metric_table: + id: 1606 + title: L1D Addr Translation Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: {} diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_L2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_L2_cache.yaml deleted file mode 100644 index 4ce319bf9e..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_L2_cache.yaml +++ /dev/null @@ -1,401 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1700 - title: L2 Cache - data source: - - metric_table: - id: 1701 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - tips: - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - tips: - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) - unit: GB/s - tips: - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - tips: - - - metric_table: - id: 1702 - title: L2 - Fabric Transactions - header: - 
metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read BW: - avg: AVG(((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) - min: MIN(((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) - max: MAX(((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Write and Atomic BW: - avg: - AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - min: - MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - max: - 
MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - unit: Cycles - 
tips: - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - unit: Cycles - tips: - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - tips: - - - metric_table: - id: 1703 - title: L2 Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / $denom) - min: MIN((TCC_REQ_sum * 128) / $denom) - max: MAX((TCC_REQ_sum * 128) / $denom) - unit: (Bytes + $normUnit) - tips: - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - tips: - 
Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - tips: - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - tips: - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: 
(Req + $normUnit) - tips: - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1704 - title: L2 Cache Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - - - metric_table: - id: 1705 - title: L2 - Fabric Interface Stalls - header: - metric: Metric - type: Type - transaction: Transaction - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - style: - type: simple_multi_bar - metric: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - - - metric_table: - id: 1706 - title: L2 - Fabric Detailed Transaction Breakdown - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Read (128B): - avg: AVG(((TCC_BUBBLE_sum) / $denom)) - min: MIN(((TCC_BUBBLE_sum) / $denom)) - max: MAX(((TCC_BUBBLE_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached): - 
avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (32B): - avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write and Atomic: - avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: diff --git 
a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml new file mode 100644 index 0000000000..35777aa064 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml @@ -0,0 +1,545 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1700 + title: L2 Cache + metrics_description: + Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator over the total L2 cycles. + Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. The number + of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric + interface per unit time. + L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity + Fabric interface by write and atomic operations per unit time. + HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + Read BW: The total number of bytes read by the L2 cache from Infinity Fabric per + normalization unit. 
+ HBM Read Traffic: The percent of read requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Read bandwidth directed to the local HBM. + Remote Read Traffic: The percent of read requests generated by the L2 cache that + are routed to any memory location other than the accelerator's local high-bandwidth + memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the size of the request (meaning that 32B and 64B + requests are both counted as a single request), so this metric only approximates + the percent of the L2-Fabric Read bandwidth directed to a remote location. + Uncached Read Traffic: The percent of read requests generated by the L2 cache + that are reading from an uncached memory allocation. Note, as described in the + request flow section, a single 64B read request is typically counted as two + uncached read requests. So, it is possible for the Uncached Read Traffic to + reach up to 200% of the total number of read requests. This breakdown does not + consider the size of the request (i.e., 32B and 64B requests are both counted + as a single request), so this metric only approximates the percent of the L2-Fabric + read bandwidth directed to an uncached memory location. + Write and Atomic BW: The total number of bytes written by the L2 over Infinity + Fabric by write and atomic operations per normalization unit. Note that on current + CDNA accelerators, such as the MI2XX, requests are only considered atomic by + Infinity Fabric if they are targeted at non-write-cacheable memory, for example, + fine-grained memory allocations or uncached memory allocations on the MI2XX. 
+ HBM Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are routed to the accelerator's local high-bandwidth memory + (HBM). This breakdown does not consider the size of the request (meaning that + 32B and 64B requests are both counted as a single request), so this metric only + approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to the local HBM. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Remote Write and Atomic Traffic: The percent of write and atomic requests generated by the + L2 cache that are routed to any memory location other than the accelerator's + local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote + accelerator's HBM. This breakdown does not consider the size of the request + (meaning that 32B and 64B requests are both counted as a single request), so + this metric only approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to a remote location. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Atomic Traffic: The percent of write requests generated by the L2 cache that are + atomic requests to any memory location. This breakdown does not consider the + size of the request (meaning that 32B and 64B requests are both counted as a + single request), so this metric only approximates the percent of the L2-Fabric + Write and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric if + they are targeted at fine-grained memory allocations or uncached memory allocations.
+ Uncached Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are targeting uncached memory allocations. This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + Read Latency: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Write and Atomic Latency: The time-averaged number of cycles write and atomic requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2. + Bandwidth: The number of bytes looked up in the L2 cache, per normalization unit. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for + example, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + Req: The total number of incoming requests to the L2 from all clients for all + request types, per normalization unit. + Read Req: The total number of read requests to the L2 from all clients. + Write Req: The total number of write requests to the L2 from all clients. + Atomic Req: The total number of atomic requests (with and without return) to the + L2 from all clients. + Streaming Req: The total number of incoming requests to the L2 that are marked + as streaming. The exact meaning of this may differ depending on the targeted + accelerator; however, on an MI2XX this corresponds to non-temporal loads or stores.
+ The L2 cache attempts to evict streaming requests before normal requests when + the L2 is at capacity. + Probe Req: The number of coherence probe requests made to the L2 cache from outside + the accelerator. On an MI2XX, probe requests may be generated by, for example, + writes to fine-grained device memory or by writes to coarse-grained device memory. + Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + Hits: The total number of requests to the L2 from all clients that hit in the + cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. + Misses: The total number of requests to the L2 from all clients that miss in the + cache. As noted in the Speed-of-Light section, these do not include hit-on-miss + requests. + Writeback: The total number of L2 cache lines written back to memory for any reason. + Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system + or atomic built-ins), by the command processor's memory acquire/release fences, + or for other internal hardware reasons. + Writeback (Internal): The total number of L2 cache lines written back to memory + for internal hardware reasons, per normalization unit. + Writeback (vL1D Req): The total number of L2 cache lines written back to memory + due to requests initiated by the vL1D cache, per normalization unit. + Evict (Internal): The total number of L2 cache lines evicted from the cache due + to capacity limits, per normalization unit. + Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due + to invalidation requests initiated by the vL1D cache, per normalization unit. + NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per normalization unit. + UC Req: The total number of requests to the L2 that go to Uncached (UC) memory + allocations.
+ CC Req: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. + RW Req: The total number of requests to the L2 that go to Read-Write coherent + memory (RW) allocations. + Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled + on write or atomic requests to any memory location because too many write/atomic + requests were currently in flight, as a percent of the total active L2 cycles. + Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of + data from any memory location, per normalization unit. + Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of + data from any memory location, per normalization unit. + Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached + data from any memory location, per normalization unit. 64B requests for uncached + data are counted as two 32B uncached data requests. + HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or + 64B of data from any source other than the accelerator's local HBM, per normalization + unit. + Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to + write or atomically update 32B of data to any memory location, per normalization + unit. + Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric + to write or atomically update 32B or 64B of uncached data, per normalization + unit. + Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to + write or atomically update 64B of data in any memory location, per normalization + unit. + HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write + or atomically update 32B or 64B of data in the accelerator's local HBM, per + normalization unit. 
+ Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to + write or atomically update 32B or 64B of data in any memory location other than + the accelerator's local HBM, per normalization unit. + Atomic: The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per normalization unit. See Request + flow for more detail. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at non-write-cacheable memory, such as fine-grained memory allocations or uncached + memory allocations on the MI2XX. + Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\ + \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\ + \ over the total active L2 cycles." + Write Stall: The ratio of the total number of cycles the L2-Fabric interface was + stalled on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator or CPU) over the total active L2 cycles. + Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to remote PCIe connected accelerators or CPUs as a percent of + the total active L2 cycles. + Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on read requests to remote Infinity Fabric connected accelerators or + CPUs as a percent of the total active L2 cycles. + Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to the accelerator's local HBM as a percent of the total active + L2 cycles. 
+ Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to remote PCIe connected accelerators or CPUs as a + percent of the total active L2 cycles. + Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on write or atomic requests to remote Infinity Fabric connected accelerators + or CPUs as a percent of the total active L2 cycles. + Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to the accelerator's local HBM as a percent of the total + active L2 cycles. + data source: + - metric_table: + id: 1701 + title: L2 Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + - metric_table: + id: 1702 + title: L2-Fabric interface metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read BW: + avg: AVG(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + -
TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) + if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) 
else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: 
AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + - metric_table: + id: 1703 + title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: 
MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1704 + title: L2 Cache Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: {} + - metric_table: + id: 1705 + title: L2 - Fabric Interface stalls + header: 
+ metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + style: + type: simple_multi_bar + metric: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + - metric_table: + id: 1706 + title: L2 - Fabric interface detailed metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / + $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / + $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / + $denom)) + unit: (Req + $normUnit) + Read (128B): + avg: AVG(((TCC_BUBBLE_sum) / $denom)) + min: MIN(((TCC_BUBBLE_sum) / $denom)) + max: MAX(((TCC_BUBBLE_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + 
Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_L2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_L2_cache_per_channel.yaml deleted file mode 100644 index 1f1ba67ab5..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_L2_cache_per_channel.yaml +++ /dev/null @@ -1,308 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1800 - title: L2 Cache (per Channel) - data source: - - metric_table: - id: 1801 - title: Aggregate Stats (All channels) - header: - metric: Metric - avg: Avg - std dev: Std Dev - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * 
TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + 
TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + 
(TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - unit: pct - tips: - # FIXME: other arggr metrics!! - - - metric_table: - id: 1802 - title: L2 Cache Hit Rate (pct) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - - metric_table: - id: 1803 - title: L2 Requests (per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - - metric_table: - id: 1804 - title: L2 Requests (per normUnit) - header: - metric: Channel - read req: L2 Read - write req: L2 Write - atomic req: L2 Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - - metric_table: - id: 1805 - title: L2-Fabric Requests (per normUnit) - header: - metric: Channel - read req: L2-Fabric Read - write req: L2-Fabric Write and Atomic - atomic req: L2-Fabric Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - # - metric_table: - # id: 1806 - # title: L2-Fabric Latency (Cycles) - # header: - # metric: Metric - # read lat: L2-Fabric Read - # write lat: L2-Fabric Write - # atomic lat: L2-Fabric Atomic - # metric: - # "::_1": - # read lat: - # 
AVG(((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - # != 0) else None)) - # write lat: - # AVG(((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - # != 0) else None)) - # atomic lat: - # AVG(((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - # (TCC_EA0_ATOMIC[::_1] != 0) else 0)) - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_multiple_bar - - - metric_table: - id: 1806 - title: L2-Fabric Read Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - - metric_table: - id: 1807 - title: L2-Fabric Write and Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - - metric_table: - id: 1808 - title: L2-Fabric Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - (TCC_EA0_ATOMIC[::_1] != 0) else 0) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - - metric_table: - id: 1809 - title: L2-Fabric Read Stall (Cycles per normUnit) - header: - metric: Channel - ea read stall - pcie: L2-Fabric Read Stall (PCIe) - ea read stall - if: L2-Fabric Read Stall (Infinity Fabric™) - ea read stall - hbm: L2-Fabric Read Stall (HBM) - metric: - "::_1": - ea read stall - pcie: None # Missing perfmon - ea read stall - if: None # Missing perfmon - ea read stall - hbm: None # Missing perfmon - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - 
- metric_table: - id: 1810 - title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) - header: - metric: Channel - ea write stall - pcie: L2-Fabric Write Stall (PCIe) - ea write stall - gmi: L2-Fabric Write Stall (Infinity Fabric™) - ea write stall - dram: L2-Fabric Write Stall (HBM) - ea write stall - starve: L2-Fabric Write Starve - metric: - "::_1": - ea write stall - pcie: None # Missing perfmon - ea write stall - if: None # Missing perfmon - ea write stall - hbm: None # Missing perfmon - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - # - metric_table: - # id: 1811 - # title: L2 Tag Stall (cycles) - # header: - # metric: Metric - # expr: Expression - # metric: - # "::_1": - # expr: TCC_TAG_STALL[::_1] - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_box - - - metric_table: - id: 1812 - title: L2-Fabric (128B read requests per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - # tips: Number of 128-byte read requests sent to EA - cli_style: simple_box - tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml new file mode 100644 index 0000000000..849662871e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml @@ -0,0 +1,251 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + metrics_description: + L2 Cache Hit Rate: The percent of total number of requests to the L2 from all + clients that hit in the cache. As noted in the Speed-of-Light section, this + includes hit-on-miss requests. + data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 
* TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + 
TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + 
(TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + unit: pct + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + 
expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: L2-Fabric Read Stall (PCIe) + ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: 
simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1812 + title: L2-Fabric (128B read requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml index d6c4ff393d..e94471d7dc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml @@ -1,10 +1,11 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 2100 title: PC Sampling + metrics_description: {} data source: - - pc_sampling_table: - id: 2101 - title: PC Sampling - source: ps_file - comparable: false # enable it later + - pc_sampling_table: + id: 2101 + title: PC Sampling + source: ps_file + comparable: false diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml index ccf1309850..55c6f6bb24 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml @@ -1,14 +1,14 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py Panel Config: - id: 000 + id: 0 title: Top Stats + metrics_description: {} data source: - - raw_csv_table: - id: 001 - title: Top Kernels - source: pmc_kernel_top.csv - - - raw_csv_table: - id: 002 - title: Dispatch List - source: pmc_dispatch_info.csv + - raw_csv_table: + id: 1 + title: Top Kernels + source: pmc_kernel_top.csv + - raw_csv_table: + id: 2 + title: Dispatch List + source: pmc_dispatch_info.csv diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml index b7ec29eaf9..8470ffbbe3 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml @@ -1,9 +1,10 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Panel Config: id: 100 title: System Info + metrics_description: {} data source: - - raw_csv_table: - id: 101 - source: sysinfo.csv - columnwise: True + - raw_csv_table: + id: 101 + source: sysinfo.csv + columnwise: true diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system-speed-of-light.yaml deleted file mode 100644 index 13fb4f0ebb..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system-speed-of-light.yaml +++ /dev/null @@ -1,269 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 200 - title: System Speed-of-Light - data source: - - metric_table: - id: 201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - 
Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - tips: - MFMA FLOPs (F6F4): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 16834) / 1000)) - tips: - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - tips: - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: 
- SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) - * 4))) - tips: - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) - tips: - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - tips: - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - tips: - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - tips: - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - tips: - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 
128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - tips: - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + - 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + - 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - tips: - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - tips: - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Hit Rate: - value: AVG(((100 * 
SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk - / 1000) * 64) * $sqc_per_gpu)) - tips: - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml new file mode 100644 index 0000000000..84327d65ea --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml @@ -0,0 +1,352 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 200 + title: System Speed-of-Light + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations + executed per second. 
This does not include any 8-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported on + AMD Instinct MI300 series and later only. + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution.
+ SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles the MFMA was busy over the total CU cycles. + VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel. + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. This is also presented as a percent of the peak theoretical + IPC achievable on the specific accelerator.
+ Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.' + Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have + been loaded from, stored to, or atomically updated in the LDS per unit time + (see LDS Bandwidth example for more detail). This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS + scheduler due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the Bank + Conflict Rate). + vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that + hit in vL1D cache over the total number of cache line requests to the vL1D cache + RAM. + vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of + VMEM instructions per unit time. The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so e.g., if only a single value is requested + in a cache line, the data movement will still be counted as a full cache line. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit + in the L2 cache over the total number of incoming cache line requests to the + L2 cache. + L2 Cache BW: The number of bytes looked up in the L2 cache per unit time.
The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\ + \ interface per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric + interface by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: The time-averaged number of cycles read requests spent + in Infinity Fabric before data was returned to the L2. + L2-Fabric Write Latency: The time-averaged number of cycles write requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded + line in the cache. Calculated as the ratio of the number of sL1D requests that + hit over the number of all sL1D requests. + sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time. + This is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator. + L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line in the cache. + Calculated as the ratio of the number of L1I requests that hit over the number + of all L1I requests. + L1I BW: The number of bytes looked up in the L1I cache per unit time. This + is also presented as a percent of the peak theoretical bandwidth achievable + on the specific accelerator.
+ L1I Fetch Latency: The average number of cycles spent to fetch instructions to + a CU. + data source: + - metric_table: + id: 201 + title: System Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - 
Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + MFMA FLOPs (F6F4): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 *
SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + 
peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * 
TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - + TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / + $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: 
AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * + 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml deleted file mode 100644 index 0e4ff7d059..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_mem_chart.yaml +++ /dev/null @@ -1,315 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 300 - title: Memory Chart - data source: - - metric_table: - id: 301 - title: Memory Chart - header: - metric: Metric - #alias: #alias - value: Value - tips: Tips - metric: - # ---------------------------------------- - # Instr Buff Block - - #TODO: double check wave_occupancy - Wavefront Occupancy: - #alias: wave_occ_ - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), 0) - coll_level: SQ_LEVEL_WAVES - tips: - Wave Life: - #alias: wave_life_ - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else 0)), 0) - tips: - - # ---------------------------------------- - # Instr Dispatch Block - SALU: - #alias: salu_ - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - tips: - SMEM: - #alias: smem_ - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - tips: - VALU: - #alias: valu_ - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - tips: - MFMA: - #alias: mfma_ - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - tips: - VMEM: - #alias: 
vmem_ - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - tips: - LDS: - #alias: lds_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - GWS: - #alias: gws_ - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - tips: - BR: - #alias: br_ - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - tips: - - # ---------------------------------------- - # Exec Block - Active CUs: - #alias: active_cu_ - value: $numActiveCUs - tips: - Num CUs: - #alias: num_cu_ - value: $cu_per_gpu - tips: - VGPR: - #alias: vgpr_ - value: ROUND(AVG(Arch_VGPR), 0) - tips: - # Todo: add AGPRs - SGPR: - #alias: sgpr_ - value: ROUND(AVG(SGPR), 0) - tips: - LDS Allocation: - #alias: lds_alloc_ - value: ROUND(AVG(LDS_Per_Workgroup), 0) - tips: - Scratch Allocation: - #alias: scratch_alloc_ - value: ROUND(AVG(Scratch_Per_Workitem), 0) - tips: - Wavefronts: - #alias: wavefronts_ - value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) - tips: - Workgroups: - #alias: workgroups_ - value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) - tips: - - # ---------------------------------------- - # LDS Block - LDS Req: - #alias: lds_req_ - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - tips: - LDS Util: - #alias: lds_util_ - value: - ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))), - 0) - tips: - LDS Latency: - #alias: lds_lat - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - tips: - - # ---------------------------------------- - # Vector L1 Cache Block - VL1 Rd: - #alias: vl1_rd_ - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - tips: - VL1 Wr: - #alias: vl1_wr_ - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - tips: - VL1 Atomic: - #alias: vl1_atom_ - value: - ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - tips: - - VL1 Hit: - #alias: 
vl1_hit_ - value: - ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None )), 0) - tips: - VL1 Lat: - #alias: vl1_lat_ - value: - ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)), 0) - tips: - VL1 Coalesce: - #alias: vl1_coales_ - value: - ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - tips: - VL1 Stall: - #alias: vl1_stall_ - value: - ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - tips: - - VL1_L2 Rd: - #alias: vl1_l2_rd_ - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - tips: - VL1_L2 Wr: - #alias: vl1_l2_wr_ - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - tips: - VL1_L2 Atomic: - #alias: vl1_l2_atom_ - value: - ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - tips: - - # ---------------------------------------- - # Scalar L1D Cache Block - VL1D Rd: - #alias: sl1_rd_ - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - tips: - VL1D Hit: - #alias: sl1_hit_ - value: - ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - tips: - VL1D Lat: - #alias: sl1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ != - 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - tips: - - VL1D_L2 Rd: - #alias: sl1_l2_rd_ - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - tips: - VL1D_L2 Wr: - #alias: sl1_l2_wr_ - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - tips: - VL1D_L2 Atomic: - #alias: sl1_l2_atom_ - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - tips: - - # 
---------------------------------------- - # Instr L1 Cache Block - IL1 Fetch: - #alias: il1_fetch_ - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - tips: - IL1 Hit: - #alias: il1_hit_ - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - tips: - IL1 Lat: - #alias: il1_lat_ - value: - ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ != - 0) else None)) * 100), 0) - tips: # ??? coll_level: SQ_IFETCH_LEVEL - IL1_L2 Rd: - #alias: il1_l2_req_ - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - tips: - - # ---------------------------------------- - # L2 Cache Block(inside) - L2 Rd: - #alias: l2_rd_ - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - tips: - L2 Wr: - #alias: l2_wr_ - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - tips: - L2 Atomic: - #alias: l2_atom_ - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - tips: - L2 Hit: - #alias: l2_hit_ - value: - ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)), 0) - tips: - L2 Rd Lat: - #alias: l2_rd_lat_ - value: - ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), - 0) - tips: - L2 Wr Lat: - #alias: l2_wr_lat_ - value: - ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - != 0) else None)), 0) - tips: - - # ---------------------------------------- - # Fabric Block - Fabric_L2 Rd: - #alias: l2_fabric_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - tips: - Fabric_L2 Wr: - #alias: l2_fabric_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - tips: - Fabric_L2 Atomic: - #alias: l2_fabric_atom_ - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - tips: - - Fabric Rd Lat: - #alias: fabric_rd_lat_ - value: - ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / 
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - tips: - Fabric Wr Lat: - #alias: fabric_wr_lat_ - value: - ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - tips: - Fabric Atomic Lat: - #alias: fabric_atom_lat_ - value: - ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - tips: - - HBM Rd: - #alias: hbm_rd_ - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - tips: - HBM Wr: - #alias: hbm_wr_ - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - tips: - - comparable: false # for now - cli_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml new file mode 100644 index 0000000000..ecfbf69b6a --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml @@ -0,0 +1,269 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 300 + title: Memory Chart + metrics_description: + Wavefront Occupancy: Wavefronts per active CU. + Wave Life: Average number of cycles executing a wave. + SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization + unit. + SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization + unit. + VALU: The number of VALU (Vector ALU) instructions issued per normalization unit. + MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per + normalization unit. + VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit.
+ LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's __shfl instructions) executed per normalization unit. + GWS: Total number of GDS (global data sync) instructions issued per normalization + unit. + BR: Total number of BRANCH instructions issued per normalization unit. + Active CUs: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + Num CUs: Total number of compute units (CUs) on the accelerator. + VGPR: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + SGPR: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Wavefronts: The total number of wavefronts, summed over all workgroups, forming + this kernel launch. + Workgroups: The total number of workgroups forming this kernel launch. + LDS Req: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Util: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). 
Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + VL1 Rd: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit + VL1 Wr: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit + VL1 Atomic: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit + VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + VL1 Coalesce: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting + to issue a request for data to the L2 cache divided by the number of cycles + where the vL1D is active. + VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return.
+ sL1D Rd: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization + unit. + sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + IL1 Fetch: The total number of requests made to the L1I per normalization-unit. + IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over the + number of all L1I requests. + IL1 Lat: The average number of cycles spent to fetch instructions to a CU. + IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit. + L2 Rd: The total number of read requests to the L2 from all clients. + L2 Wr: The total number of write requests to the L2 from all clients. + L2 Atomic: The total number of atomic requests (with and without return) to the + L2 from all clients. + L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took + to issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values.
+ Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. + Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte + or 64-byte) summed over TCC instances per normalization unit. + Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either + 32-byte or 64-byte) that are actually atomic requests summed over TCC instances + per normalization unit. + Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity + Fabric before a completion acknowledgement was returned to the L2. + Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in + Infinity Fabric before a completion acknowledgement (atomic without return value) + or data (atomic with return value) was returned to the L2. + HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator''s local HBM, per normalization + unit. 
' + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + value: Value + metric: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) else + 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), + 0) + Workgroups: + value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + + SPI_CS3_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + 
TCC_MISS_sum)) if + ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + != 0) else None)), 0) + L2 Wr Lat: + value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + != 0) else None)), 0) + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + comparable: false + cli_style: mem_chart + tui_style: mem_chart diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml new file mode 100644 index 0000000000..41c8bac547 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml @@ -0,0 +1,9 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 400 + title: Roofline + metrics_description: {} + data source: + - None: + id: 401 + title: Roofline diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline_info.yaml deleted file mode 100644 index 1474b85cf2..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline_info.yaml +++ /dev/null @@ -1,8 +0,0 @@ ---- -Panel Config: - id: 400 - title: Roofline - data source: - - None: - id: 401 - title: Roofline \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command-processor.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command-processor.yaml deleted file mode 100644 index 3e1a0d5f65..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command-processor.yaml +++ /dev/null @@ -1,153 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 500 - title: Command Processor (CPC/CPF) - data source: - - metric_table: - id: 501 - title: Command Processor Fetcher - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - tips: - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - 
!= 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None) - unit: pct - tips: - - - metric_table: - id: 502 - title: Packet Processor - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - CPC SYNC FIFO Full Rate: - avg: AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) - min: MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) - max: MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) - unit: pct - tips: - CPC CANE Stall Rate: - avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - unit: pct - tips: - CPC ADC Utilization: - avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - unit: pct - tips: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - tips: - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * 
CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - tips: - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - tips: - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY != 0) else None) - unit: Pct - tips: - CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - tips: - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: pct - tips: - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: 
MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml new file mode 100644 index 0000000000..722514277c --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml @@ -0,0 +1,166 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 500 + title: Command Processor (CPC/CPF) + metrics_description: + CPF Utilization: Percent of total cycles where the CPF was busy actively doing + any work. The ratio of CPF busy cycles over total cycles counted by the CPF. + CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles + over total cycles counted by the CPF-L2. + CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was + stalled for any reason. + CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address + translation. + CPC Utilization: Percent of total cycles where the CPC was busy actively doing + any work. The ratio of CPC busy cycles over total cycles counted by the CPC. + CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason. 
+ CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands + for processing. + CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching + workgroups to the workgroup manager. + CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where + the CPC-L2 interface was active doing any work. + CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address + translation + CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address + translation interface where the CPC was busy doing address translation work. ' + data source: + - metric_table: + id: 501 + title: Command processor fetcher (CPF) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if 
((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + - metric_table: + id: 502 + title: Command processor packet processor (CPC) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + CPC SYNC FIFO Full Rate: + avg: AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY + != 0) else None) + min: MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY + != 0) else None) + max: MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY + != 0) else None) + unit: pct + CPC CANE Stall Rate: + avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) + else None) + min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) + else None) + max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) + else None) + unit: pct + CPC ADC Utilization: + avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + unit: pct + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if 
((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if 
+ (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_shader-processor-input.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_shader-processor-input.yaml deleted file mode 100644 index d2b5944459..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_shader-processor-input.yaml +++ /dev/null @@ -1,188 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 600 - title: Workgroup Manager (SPI) - data source: - - metric_table: - id: 601 - title: Workgroup Manager Utilizations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Schedule-Pipe Wave Occupancy: - avg: AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) - min: MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) - max: MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) - unit: Wave - tips: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - tips: - Scheduler-Pipe Utilization: - avg: AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - tips: - Scheduler-Pipe Wave Utilization: - avg: AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - tips: - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - tips: - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / 
($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - tips: - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Dispatched Workgroups: - avg: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - min: MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - max: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - unit: Workgroups - tips: - Dispatched Wavefronts: - avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - unit: Wavefronts - tips: - VGPR Writes: - avg: AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else - None)) - min: MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else - None)) - max: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else - None)) - unit: Cycles/wave - tips: - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + 
SPI_CS3_WAVE) != 0) else - None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else - None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else - None)) - unit: Cycles/wave - tips: - - metric_table: - id: 602 - title: Workgroup Manager - Resource Allocation - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Scheduler-Pipe FIFO Full Rate: - avg: AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - min: MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - max: MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + 
SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None) - unit: Pct - tips: - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != - 0) else None)) - unit: Pct - tips: - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - tips: - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU LDS: - avg: AVG(400 * 
SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml new file mode 100644 index 0000000000..c32f4ded90 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml @@ -0,0 +1,237 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 600 + title: Workgroup Manager (SPI) + metrics_description: + Accelerator Utilization: The percent of cycles in the kernel where the accelerator + was actively doing any work. 
+ Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the + kernel where the scheduler-pipes were actively doing any work. + Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup + manager was actively doing any work. + Shader Engine Utilization: The percent of total shader engine cycles in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. + SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD + on a CU was actively doing any work, summed over all CUs. Low values (less than + 100%) indicate that the accelerator was not fully saturated by the kernel, or + a potential load-imbalance issue. + Dispatched Workgroups: The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups, + forming this kernel launch. + VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation. + Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the workgroup manager rather than a lack of a CU or SIMD with sufficient + resources. + Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles + in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck + within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient + resources. ' + Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel + where a workgroup could not be scheduled to a CU due to occupancy limitations + (like a lack of a CU or SIMD with sufficient resources). 
+ Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where + a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch) + memory slots. While this can reach up to 100%, note that the actual occupancy + limitations on a kernel using private memory are typically quite small (for + example, less than 1% of the total number of waves that can be scheduled to + an accelerator). + Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available waveslots. + Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where + a workgroup could not be scheduled to a SIMD due to lack of available SGPRs. + Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup + could not be scheduled to a CU due to lack of available LDS. + Insufficient CU Barriers: The percent of total CU cycles in the kernel where a + workgroup could not be scheduled to a CU due to lack of available barriers. + Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where + a workgroup could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators). + Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where + a wavefront could not be scheduled to a CU due to limits within the workgroup + manager. This is expected to always be zero on CDNA2 or newer accelerators + (and small for previous accelerators).
+ data source: + - metric_table: + id: 601 + title: Workgroup manager utilizations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Schedule-Pipe Wave Occupancy: + avg: AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + + SPI_CSQ_P3_OCCUPANCY) + min: MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + + SPI_CSQ_P3_OCCUPANCY) + max: MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + + SPI_CSQ_P3_OCCUPANCY) + unit: Wave + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) + / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) + / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) + / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + unit: Pct + Scheduler-Pipe Wave Utilization: + avg: AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * 
$se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + + SPI_CS3_NUM_THREADGROUPS) + min: MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + + SPI_CS3_NUM_THREADGROUPS) + max: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + + SPI_CS3_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + != 0) else None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + + 
SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + != 0) else None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + != 0) else None)) + unit: Cycles/wave + - metric_table: + id: 602 + title: Workgroup Manager - Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe FIFO Full Rate: + avg: AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if + ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if + ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if + ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + 
Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / 
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront-launch.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront-launch.yaml deleted file mode 100644 index 927cc3d2e9..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront-launch.yaml +++ /dev/null @@ -1,142 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 700 - title: Wavefront - data source: - - metric_table: - id: 701 - title: Wavefront Launch Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - tips: - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - tips: - Total Wavefronts: - avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - unit: Wavefronts - tips: - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - tips: - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - tips: - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - tips: - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - tips: - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - tips: - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - tips: - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - tips: - - - metric_table: - id: 702 - title: Wavefront Runtime Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - tips: - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: 
MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - tips: - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - tips: - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - tips: - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - tips: - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml new file mode 100644 index 0000000000..25679c6207 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml @@ -0,0 +1,173 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 700 + title: Wavefront + metrics_description: + Grid Size: The total number of work-items (or, threads) launched as a part of + the kernel dispatch. 
In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + Workgroup Size: The total number of work-items (or, threads) in each workgroup + (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent + to the total block size. + Total Wavefronts: "The total number of wavefronts launched as part of the kernel\ + \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\ + \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\ + \ should be equivalent to the ceiling of grid size divided by 64." + Saved Wavefronts: The total number of wavefronts saved at a context-save. + Restored Wavefronts: The total number of wavefronts restored from a context-save. + VGPRs: 'The number of architected vector general-purpose registers allocated for + the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested + by the compiler due to allocation granularity.' + AGPRs: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs + requested by the compiler due to allocation granularity.' + SGPRs: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated + for this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + Scratch Allocation: The number of bytes of scratch memory requested per work-item + for this kernel. Scratch memory is used for stack memory on the accelerator, + as well as for register spills and restores. + Kernel Time: The total duration of the executed kernel. 
+ Kernel Time (Cycles): The total duration of the executed kernel in cycles. + Instructions per wavefront: The average number of instructions (of all types) + executed per wavefront. This is averaged over all wavefronts in a kernel dispatch. + Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per normalization unit. This is averaged over all wavefronts + in a kernel dispatch. + Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch + spent stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) per normalization unit. This is averaged over + all wavefronts in a kernel dispatch. + Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was + unable to issue an instruction for any reason (e.g., execution pipe back-pressure, + arbitration loss, etc.) per normalization unit. This counter is incremented + at every cycle by all wavefronts on a CU unable to issue an instruction. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could be + actively executing while a wave is issue stalled. The sum of this metric, Dependency + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: The average number of cycles a wavefront in the kernel dispatch + was actively executing instructions per normalization unit. This measurement + is made on a per-wavefront basis, and may include cycles that another wavefront + spent actively executing (on another execution unit, for example) or was stalled. + As such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter. The sum of this metric, Issue + Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles + metric. + Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator + over the lifetime of the kernel.
Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / 
SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml deleted file mode 100644 index bdc273dfad..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute-unit-instruction-mix.yaml +++ /dev/null @@ -1,289 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1000 - title: Compute Units - Instruction Mix - data source: - - metric_table: - id: 1001 - title: Overall Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - tips: - VMEM: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - tips: - LDS: - # TODO: need to fix this when the new FLAT/LDS counts - # are present in ROCm - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - tips: - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - tips: - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - tips: - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - tips: - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1002 - title: VALU Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - tips: - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: 
MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - tips: - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - tips: - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - tips: - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / 
$denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - tips: - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1003 - title: VMEM Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Coalesceable Instr: - avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: 
MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - tips: - - - metric_table: - id: 1004 - title: MFMA Arithmetic Instr Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - tips: - MFMA-F6F4: - avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / 
$denom)) - unit: (instr + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml new file mode 100644 index 0000000000..3a40d83f61 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml @@ -0,0 +1,319 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1000 + title: Compute Units - Instruction Mix + metrics_description: + VALU: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the compute unit, and are used to execute a wide + range of instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, shifts, + conditional evaluation, etc. + VMEM: The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to generic, global, private + and texture memory. + LDS: The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's __shfl operations. + MFMA: The total number of matrix fused multiply-add instructions issued. + SALU: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and other + operations that are provably uniform across a wavefront. Although scalar memory + (SMEM) operations are issued by the SALU, they are counted separately in this + section. + SMEM: The total number of scalar memory (SMEM) operations issued. 
These are typically + used for loading kernel arguments, base-pointers and loads from HIP's __constant__ + memory. + Branch: The total number of branch operations issued. These typically consist + of jump or branch operations and are used to implement control flow. + INT32: The total number of instructions operating on 32-bit integer operands issued + to the VALU per normalization unit. + INT64: The total number of instructions operating on 64-bit integer operands issued + to the VALU per normalization unit. + F16-ADD: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + F16-FMA: The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per normalization unit. + F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating + on 16-bit floating-point operands issued to the VALU per normalization unit. + F32-ADD: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + F32-FMA: The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per normalization unit. + F32-Trans: The total number of transcendental instructions (such as sqrt) operating + on 32-bit floating-point operands issued to the VALU per normalization unit. + F64-ADD: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. 
+ F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + F64-FMA: The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per normalization unit. + F64-Trans: The total number of transcendental instructions (such as sqrt) operating + on 64-bit floating-point operands issued to the VALU per normalization unit. + Conversion: "The total number of type conversion instructions (such as converting\ + \ data to or from F32\u2194F64) issued to the VALU per normalization unit." + Global/Generic Instr: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read: The total number of global & generic memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Write: The total number of global & generic memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Atomic: The total number of global & generic memory atomic (with + and without return) instructions executed on all compute units on the accelerator, + per normalization unit. + Spill/Stack Instr: The total number of spill/stack memory instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Read: The total number of spill/stack memory read instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Write: The total number of spill/stack memory write instructions executed + on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per normalization + unit. 
Typically unused as these memory operations are typically used to implement + thread-local storage. + MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization + unit. + MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per + normalization unit. This is supported in AMD Instinct MI300 series and later + only. + MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per + normalization unit. + MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued + per normalization unit. + MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per + normalization unit. + MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per + normalization unit. + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: 
MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1002 + title: VALU Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / 
$denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1003 + title: VMEM Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Coalesceable 
Instr: + avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1004 + title: MFMA Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + 
max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + MFMA-F6F4: + avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) + unit: (instr + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute-unit-compute-pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute-unit-compute-pipeline.yaml deleted file mode 100644 index bd549f024c..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute-unit-compute-pipeline.yaml +++ /dev/null @@ -1,293 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - data source: - - metric_table: - id: 1101 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - tips: Tips - metric: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) - / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 
* SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk - * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - tips: - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - tips: - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - tips: - MFMA FLOPs (F6F4): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 16834) / 1000)) - tips: - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - tips: - - - metric_table: - id: 1102 - title: Pipeline Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - tips: - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - tips: - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - 
tips: - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - # Precentage of VALU instructions which are issued to two VALUs at a time - VALU Co-Issue Efficiency: - avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) - min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) - max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) - unit: pct - tips: - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - tips: - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - tips: - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * 
$GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - MFMA Instr Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) - else None)) - unit: cycles/instr - tips: - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - tips: - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - tips: - - - metric_table: - id: 1103 - title: Arithmetic Operations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / - $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * 
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / - $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) - + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 - * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) - + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / - $denom)) - unit: (OPs + $normUnit) - tips: - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) - unit: (OPs + $normUnit) - tips: - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - tips: - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + 
(64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + - (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * - SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - tips: - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - tips: - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) - unit: (OPs + $normUnit) - tips: - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - tips: - F6F4 OPs: - avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) - min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) - max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) - unit: (OPs + $normUnit) - tips: 
- INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml new file mode 100644 index 0000000000..329e28d6e8 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml @@ -0,0 +1,346 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1100 + title: Compute Units - Compute Pipeline + metrics_description: + VALU FLOPs: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + VALU IOPs: 'The total integer operations executed per second on the VALU. This + is also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations + executed per second. Note: this does not include any 16-bit brain floating point + operations from VALU instructions. This is also presented as a percent of the + peak theoretical BF16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed + per second. Note: this does not include any 16-bit floating point operations + from VALU instructions. 
This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed + per second. Note: this does not include any 32-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed + per second. Note: this does not include any 64-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed + per second. Note: this does not include any 8-bit integer operations from VALU + instructions. This is also presented as a percent of the peak theoretical INT8 + MFMA operations achievable on the specific accelerator.' + IPC: The ratio of the total number of instructions executed on the CU over the + total active CU cycles. + IPC (Issued): The ratio of the total number of (non-internal) instructions issued + over the number of cycles where the scheduler was actively working on issuing + instructions. + SALU Utilization: Indicates what percent of the kernel's duration the SALU was + busy executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles. + VALU Utilization: Indicates what percent of the kernel's duration the VALU was + busy executing instructions. Does not include VMEM operations. Computed as the + ratio of the total number of cycles spent by the scheduler issuing VALU instructions + over the total CU cycles. 
+ VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit + was busy executing instructions, including both global/generic and spill/scratch + operations (see the VMEM instruction count metrics for more detail). Does not + include VALU operations. Computed as the ratio of the total number of cycles + spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: Indicates what percent of the kernel's duration the branch + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the scheduler issuing branch instructions over the total + CU cycles. + VALU Active Threads: Indicates the average level of divergence within a wavefront + over the lifetime of the kernel. The number of work-items that were active in + a wavefront during execution of each VALU instruction, time-averaged over all + VALU instructions run on all wavefronts in the kernel + MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit + was busy executing instructions. Computed as the ratio of the total number of + cycles spent by the MFMA was busy over the total CU cycles. + MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. + VMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a VMEM instruction to complete. + SMEM Latency: The average number of round-trip cycles (that is, from issue to + data return / acknowledgment) required for a SMEM instruction to complete. + FLOPs (Total): The total number of floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + IOPs (Total): The total number of integer operations executed on either the VALU + or MFMA units, per normalization unit. 
+ F16 OPs: The total number of 16-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + BF16 OPs: The total number of 16-bit brain floating-point operations executed + on either the VALU or MFMA units, per normalization unit. + F32 OPs: The total number of 32-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + F64 OPs: The total number of 64-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + INT8 OPs: The total number of 8-bit integer operations executed on either the + VALU or MFMA units, per normalization unit. + data source: + - metric_table: + id: 1101 + title: Compute Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: 
((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + MFMA FLOPs (F6F4): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp + - 
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - + Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + - metric_table: + id: 1102 + title: Pipeline Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Co-Issue Efficiency: + avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) + min: MIN((100 * 
SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) + max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != + 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else 
None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + 
(SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 + * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: 
+ avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + F6F4 OPs: + avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) + min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) + max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_lds.yaml deleted file mode 100644 index 3b0669d4de..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_lds.yaml +++ /dev/null @@ -1,166 +0,0 @@ ---- -# Add description/tips for each 
metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1200 - title: Local Data Share (LDS) - data source: - - metric_table: - id: 1201 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - tips: - Theoretical Bandwidth (% of Peak): - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - unit: Pct of Peak - tips: - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1202 - title: LDS Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - LDS Instrs: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - tips: - LDS LOAD: - avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) - min: MIN((SQ_INSTS_LDS_LOAD / $denom)) - max: MAX((SQ_INSTS_LDS_LOAD / $denom)) - unit: (instr + $normUnit) - tips: - LDS STORE: - avg: AVG((SQ_INSTS_LDS_STORE / $denom)) - min: MIN((SQ_INSTS_LDS_STORE / $denom)) - max: MAX((SQ_INSTS_LDS_STORE / $denom)) - unit: (instr + $normUnit) - tips: - LDS ATOMIC: - avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) - min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) - max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) - unit: (instr + $normUnit) - tips: - LDS LOAD Bandwidth: - avg: AVG(64 * 
SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - tips: - LDS STORE Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - tips: - LDS ATOMIC Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - tips: - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / $denom)) - unit: (Bytes + $normUnit) - tips: - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - tips: - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - 
unit: Conflicts/Access - tips: - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - tips: - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - tips: - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - tips: - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - tips: - LDS Command FIFO Full Rate: - avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - tips: - LDS Data FIFO Full Rate: - avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml new file mode 100644 index 0000000000..0609c0a203 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml @@ -0,0 +1,181 @@ +# 
AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1200 + title: Local Data Share (LDS) + metrics_description: + Utilization: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic and + HIP's __shfl operations). Calculated as the ratio of the total number of cycles + LDS was active over the total CU cycles. + Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS + instructions, averaged over the lifetime of the kernel. Calculated as the ratio + of the total number of cycles spent by the scheduler issuing LDS instructions + over the total CU cycles. + Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been + loaded from, stored to, or atomically updated in the LDS per normalization unit. + Does not take into account the execution mask of the wavefront when the instruction + was executed. + Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent + servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing + bank conflicts over the number of LDS cycles that would have been required to + move the same amount of data in an uncontended access. + LDS Instructions: The total number of LDS instructions (including, but not limited + to, read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler + due to bank conflicts (as determined by the conflict resolution hardware) to + the base number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. 
This is the unnormalized form of the Bank Conflict Rate. + Index Accesses: The total number of cycles spent in the LDS scheduler over all + operations per normalization unit. + Atomic Return Cycles: The total number of cycles spent on LDS atomics with return + per normalization unit. + Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Addr Conflict: The total number of cycles spent in the LDS scheduler due to address + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + Unaligned Stall: The total number of cycles spent in the LDS scheduler due to + stalls from non-dword aligned addresses per normalization unit. + Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\ + \ normalization unit. This is unused and expected to be zero in most configurations\ + \ for modern CDNA\u2122 accelerators." + data source: + - metric_table: + id: 1201 + title: LDS Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1202 + title: LDS Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + 
metric: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + LDS LOAD: + avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) + min: MIN((SQ_INSTS_LDS_LOAD / $denom)) + max: MAX((SQ_INSTS_LDS_LOAD / $denom)) + unit: (instr + $normUnit) + LDS STORE: + avg: AVG((SQ_INSTS_LDS_STORE / $denom)) + min: MIN((SQ_INSTS_LDS_STORE / $denom)) + max: MAX((SQ_INSTS_LDS_STORE / $denom)) + unit: (instr + $normUnit) + LDS ATOMIC: + avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) + min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) + max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) + unit: (instr + $normUnit) + LDS LOAD Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps + LDS STORE Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps + LDS ATOMIC Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + min: 
MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) else + None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) + LDS Command FIFO Full Rate: + avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + LDS Data FIFO Full 
Rate: + avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction-cache.yaml deleted file mode 100644 index 209a42726e..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction-cache.yaml +++ /dev/null @@ -1,105 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1300 - title: Instruction Cache - data source: - - metric_table: - id: 1301 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - tips: - L1I-L2 Bandwidth: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1302 - title: Instruction Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - tips: - Misses 
- Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - tips: - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - tips: - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - tips: - - metric_table: - id: 1303 - title: Instruction Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) - min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) - max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) - unit: (Bytes + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml new file mode 100644 index 0000000000..a53c23691f --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml @@ -0,0 +1,106 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1300 + title: Instruction Cache + metrics_description: + Bandwidth: The number of bytes looked up in the L1I cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of L1I requests over the + total L1I cycles. + Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously + loaded line in the cache. Calculated as the ratio of the number of L1I requests + that hit over the number of all L1I requests. + L1I-L2 Bandwidth: "The percent of the peak theoretical L1I \u2192 L2 cache request\ + \ bandwidth achieved. Calculated as the ratio of the total number of requests\ + \ from the L1I to the L2 cache over the total L1I-L2 interface cycles." + Req: The total number of requests made to the L1I per normalization-unit. + Hits: The total number of L1I requests that hit on a previously loaded cache line, + per normalization-unit. + Misses - Non Duplicated: The total number of L1I requests that missed on a cache + line that was not already pending due to another request, per normalization-unit. + Misses - Duplicated: The total number of L1I requests that missed on a cache line + that was already pending due to another request, per normalization-unit. + Instruction Fetch Latency: The average number of cycles spent to fetch instructions + to a CU. 
+ data source: + - metric_table: + id: 1301 + title: L1I Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1302 + title: L1I cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / 
SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + - metric_table: + id: 1303 + title: L1I <-> L2 interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_constant-cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_constant-cache.yaml deleted file mode 100644 index 669a5834b9..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_constant-cache.yaml +++ /dev/null @@ -1,171 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1400 - title: Scalar L1 Data Cache - data source: - - metric_table: - id: 1401 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Bandwidth: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - tips: - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - tips: - sL1D-L2 BW: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 100000) - / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1402 - title: Scalar L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - 
tips: Tips - metric: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - tips: - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - tips: - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - tips: - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: 
MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - tips: - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1403 - title: Scalar L1D Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) * 64)) / $denom)) - unit: (Bytes + $normUnit) - tips: - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: 
MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - tips: - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml new file mode 100644 index 0000000000..d43157ce8e --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml @@ -0,0 +1,186 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1400 + title: Scalar L1 Data Cache + metrics_description: + Bandwidth: The number of bytes looked up in the sL1D cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of sL1D requests over the + total sL1D cycles. + Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously + loaded line in the cache. The ratio of the number of sL1D requests that hit over + the number of all sL1D requests. + sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, per normalization unit. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so in\ + \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth." + Req: The total number of requests, of any size or type, made to the sL1D per normalization + unit. + Hits: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. 
+ Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache + line that was not already pending due to another request, per normalization + unit. ' + Misses- Duplicated: The total number of sL1D requests that missed on a cache line + that was already pending due to another request, per normalization unit. + Read Req (Total): The total number of sL1D read requests of any size, per normalization + unit. + Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Read Req (1 DWord): The total number of sL1D read requests made for a single dword + of data (4B), per normalization unit. + Read Req (2 DWord): The total number of sL1D read requests made for two dwords + of data (8B), per normalization unit. + Read Req (4 DWord): The total number of sL1D read requests made for four dwords + of data (16B), per normalization unit. + Read Req (8 DWord): The total number of sL1D read requests made for eight dwords + of data (32B), per normalization unit. + Read Req (16 DWord): The total number of sL1D read requests made for sixteen + dwords of data (64B), per normalization unit. + Read Req: The total number of read requests from sL1D to the L2 per normalization + unit. + Write Req: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\ + \ per normalization unit." 
+ data source: + - metric_table: + id: 1401 + title: Scalar L1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * (End_Timestamp + - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1402 + title: Scalar L1D cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + 
SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: 
Max + unit: Unit + metric: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_TA_and_TD.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_TA_and_TD.yaml deleted file mode 100644 index 4e295e1fe9..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_TA_and_TD.yaml +++ /dev/null @@ -1,210 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. 
-Panel Config: - id: 1500 - title: Address Processing Unit and Data Return Path (TA/TD) - data source: - - metric_table: - id: 1501 - title: Address Processing Unit - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Data-Processor → Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Sequencer → TA Address Stall: - avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - tips: - Sequencer → TA Command Stall: - avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - tips: - 
Sequencer → TA Data Stall: - avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - tips: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Read Instructions for LDS: - avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / 
$denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Read Instructions for LDS: - avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Spill/Stack Total Cycles: - avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1502 - title: Data-Return Path - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Cache RAM → Data-Return Stall: - avg: AVG(((100 * TD_TC_STALL_sum) / 
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Workgroup manager → Data-Return Stall: - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - tips: - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - tips: - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: - Write Ack Instructions: - avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml new file mode 100644 index 0000000000..dfe29d7b99 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml @@ -0,0 +1,263 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1500 + title: Address Processing Unit and Data Return Path (TA/TD) + metrics_description: + Address Processing Unit Busy: Percent of the total CU cycles the address processor + was busy + Address Stall: Percent of the total CU cycles the address processor was stalled + from sending address requests further into the vL1D pipeline. + Data Stall: Percent of the total CU cycles the address processor was stalled from + sending write/atomic data further into the vL1D pipeline. + "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address + processor was stalled waiting to send command data to the data processor. + Total Instructions: The total number of memory instructions executed by the address + processor over all compute units on the accelerator, per normalization unit. + Global/Generic Instructions: The total number of global & generic memory instructions + executed on all compute units on the accelerator, per normalization unit. + Global/Generic Read Instructions: The total number of global & generic memory + read instructions executed on all compute units on the accelerator, per normalization + unit. + Global/Generic Write Instructions: The total number of global & generic memory + write instructions executed on all compute units on the accelerator, per normalization + unit.
+ Global/Generic Atomic Instructions: The total number of global & generic memory + atomic (with and without return) instructions executed on all compute units + on the accelerator, per normalization unit. + Spill/Stack Instructions: The total number of spill/stack memory instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Read Instructions: The total number of spill/stack memory read instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Write Instructions: The total number of spill/stack memory write instructions + executed on all compute units on the accelerator, per normalization unit. + Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic + (with and without return) instructions executed on all compute units on the + accelerator, per normalization unit. Typically unused as these memory operations + are typically used to implement thread-local storage. + Spill/Stack Total Cycles: The number of cycles the address processing unit spent + working on spill/stack instructions, per normalization unit. + Spill/Stack Coalesced Read: The number of cycles the address processing unit spent + working on coalesced spill/stack read instructions, per normalization unit. + Spill/Stack Coalesced Write: The number of cycles the address processing unit + spent working on coalesced spill/stack write instructions, per normalization + unit. + Data-Return Busy: Percent of the total CU cycles the data-return unit was busy + processing or waiting on data to return to the CU. + "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return + unit was stalled on data to be returned from the vL1D Cache RAM. + "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the + data-return unit was stalled by the workgroup manager due to initialization + of registers as a part of launching new workgroups. 
+ Coalescable Instructions: The number of instructions submitted to the data-return + unit by the address processor that were found to be coalescable, per normalization + unit. + Read Instructions: The number of read instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack reads in the address processor. + Write Instructions: The number of store instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack stores in the address processor. + Atomic Instructions: The number of atomic instructions submitted to the data-return + unit by the address processor summed over all compute units on the accelerator, + per normalization unit. This is expected to be the sum of global/generic and + spill/stack atomics in the address processor. 
+ data source: + - metric_table: + id: 1501 + title: Busy and stall metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: 
MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1502 + title: Instruction counts + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions for LDS: + avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + 
unit: (Instructions + $normUnit) + Spill/Stack Read Instructions for LDS: + avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + - metric_table: + id: 1503 + title: Spill and stack metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * 
$cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Write Ack Instructions: + avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml deleted file mode 100644 index 2c34924a15..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_L1_cache.yaml +++ /dev/null @@ -1,482 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1600 - title: Vector L1 Data Cache - data source: - - metric_table: - id: 1601 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: Pct of Peak - tips: - Bandwidth: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - tips: - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - tips: - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - tips: - comparable: false # for now - cli_style: simple_bar - - - metric_table: - id: 1602 - title: L1D Cache Stalls (%) - header: - metric: Metric - expr: Expression - tips: Tips - metric: - Stalled on L2 Data: - expr: - (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on L2 Req: - expr: - (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Address: - expr: - (((100 * 
TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Data: - expr: - (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Latency FIFO: - expr: - (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Request FIFO: - expr: - (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Stalled on Read Return: - expr: - (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - tips: - Tag RAM Stall (Read): - expr: - (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Write): - expr: - (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - Tag RAM Stall (Atomic): - expr: - (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - tips: - cli_style: simple_box - - - metric_table: - id: 1603 - title: L1D Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + 
TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) - unit: (Bytes + $normUnit) - tips: - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / - TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else - None)) - unit: pct - tips: - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - tips: - Invalidations: - avg: 
AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) - unit: (Bytes + $normUnit) - tips: - Tag RAM 0 Req: - avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Tag RAM 1 Req: - avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Tag RAM 2 Req: - avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Tag RAM 3 Req: - avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - tips: - L1 Access Latency: - avg: AVG((TCP_TCP_LATENCY_sum / $denom)) - min: MIN((TCP_TCP_LATENCY_sum / $denom)) - max: MAX((TCP_TCP_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - L1-L2 Read Latency: - avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - L1-L2 Write Latency: - avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1604 - title: L1D - L2 Transactions - header: - metric: Metric - xfer: Xfer - coherency: Coherency - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / 
$denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1605 - title: L1D Addr Translation - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - 
tips: - Inflight Req: - avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) - min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) - max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) - units: (Req + $normUnit) - tips: - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) if - (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - tips: - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - tips: - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - Misses under Translation Miss: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - tips: - - metric_table: - id: 1606 - title: L1D Addr Translation Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Units - tips: Tips - metric: - Cache Full Stall: - avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - units: (Cycles + $normUnit) - tips: - Cache Miss Stall: - avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - min: 
MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - units: (Cycles + $normUnit) - tips: - Serialization Stall: - avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - units: (Cycles + $normUnit) - tips: - Thrashing Stall: - avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - units: (Cycles + $normUnit) - tips: - Latency FIFO Stall: - avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - units: (Cycles + $normUnit) - tips: - Resident Page Full Stall: - avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - units: (Cycles + $normUnit) - tips: - UTCL2 Stall: - avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - units: (Cycles + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml new file mode 100644 index 0000000000..a196aa64f0 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml @@ -0,0 +1,507 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +Panel Config: + id: 1600 + title: Vector L1 Data Cache + metrics_description: + Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + Bandwidth: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions, as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. The number of bytes is calculated as the number of cache + lines requested multiplied by the cache line size. This value does not consider + partial requests, so for instance, if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. + Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution. + The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + Coalescing: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated + as the average number of thread-requests generated per instruction divided by + the ideal number of thread-requests per instruction. + Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled + waiting for requested data to return from the L2 cache divided by the number + of cycles where the vL1D is active. + Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled + waiting to issue a request for data to the L2 cache divided by the number of + cycles where the vL1D is active. + Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled + due to Read requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. 
+ Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled + due to Write requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled + due to Atomic requests with conflicting tags being looked up concurrently, divided + by the number of cycles where the vL1D is active. + Total Req: The total number of incoming requests from the address processing unit + after coalescing. + Read Req: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + Write Req: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + Atomic Req: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. + Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM + instructions per normalization unit. The number of bytes is calculated as the + number of cache lines requested multiplied by the cache line size. This value + does not consider partial requests, so for instance, if only a single value + is requested in a cache line, the data movement will still be counted as a full + cache line. + Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in + vL1D cache over the total number of cache line requests to the vL1D Cache RAM. + Cache Accesses: The total number of cache line lookups in the vL1D. + Cache Hits: The number of cache accesses minus the number of outgoing requests + to the L2 cache, that is, the number of cache line requests serviced by the + vL1D Cache RAM per normalization unit. + Invalidations: The number of times the vL1D was issued a write-back invalidate + command during the kernel's execution per normalization unit. 
This may be triggered + by, for instance, the buffer_wbinvl1 instruction. + L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result + of VMEM instructions, per normalization unit. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so for instance, if only a single + value is requested in a cache line, the data movement will still be counted + as a full cache line. + L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + L1-L2 Write: The number of write requests to a vL1D cache line that were sent + through the vL1D to the L2 cache, per normalization unit. + L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to + the L2 cache, per normalization unit. This includes requests for atomics with, + and without return. + L1 Access Latency: Calculated as the average number of cycles that a vL1D cache + line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache + took to issue and receive read requests from the L2 Cache. This number also + includes requests for atomics with return values. + L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D + cache took to issue and receive acknowledgement of a write request to the L2 + Cache. This number also includes requests for atomics without return values. + NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit.
+ RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum + over TCP instances per normalization unit. + Req: The number of translation requests made to the UTCL1 per normalization unit. + Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + Hits: The number of translation requests that hit in the UTCL1, and could be reused, + per normalization unit. + Translation Misses: The total number of translation requests that missed in the + UTCL1 due to translation not being present in the cache, per normalization + unit. + Permission Misses: "The total number of translation requests that missed in the\ + \ UTCL1 due to a permission error, per normalization unit. This is unused and\ + \ expected to be zero in most configurations for modern CDNA\u2122 accelerators." 
+ data source: + - metric_table: + id: 1601 + title: vL1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1602 + title: vL1D cache stall metrics + header: + metric: Metric + expr: Expression + metric: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on Address: + expr: (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if + (TCP_GATE_EN1_sum != 0) else None) + Stalled on Data: + expr: (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if + (TCP_GATE_EN1_sum != 0) else None) + Stalled on Latency FIFO: + expr: (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on Request FIFO: + expr: (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on Read Return: + expr: (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag 
RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1603 + title: vL1D cache access metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) + else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + Tag RAM 0 Req: + avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) + min: 
MIN((TCP_TAGRAM0_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) + unit: (Req + $normUnit) + Tag RAM 1 Req: + avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) + unit: (Req + $normUnit) + Tag RAM 2 Req: + avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) + unit: (Req + $normUnit) + Tag RAM 3 Req: + avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + L1 Access Latency: + avg: AVG((TCP_TCP_LATENCY_sum / $denom)) + min: MIN((TCP_TCP_LATENCY_sum / $denom)) + max: MAX((TCP_TCP_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + L1-L2 Read Latency: + avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + L1-L2 Write Latency: + avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1604 + title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + 
coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + metric: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - 
Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1605 + title: L1 Unified Translation Cache (UTCL1) + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Inflight Req: + avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) + min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) + max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Misses under 
Translation Miss: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + - metric_table: + id: 1606 + title: L1D Addr Translation Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + Cache Full Stall: + avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + units: (Cycles + $normUnit) + Cache Miss Stall: + avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + units: (Cycles + $normUnit) + Serialization Stall: + avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + units: (Cycles + $normUnit) + Thrashing Stall: + avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + units: (Cycles + $normUnit) + Latency FIFO Stall: + avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + units: (Cycles + $normUnit) + Resident Page Full Stall: + avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + units: (Cycles + $normUnit) + UTCL2 Stall: + avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + min: 
MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + units: (Cycles + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml deleted file mode 100644 index faf9664766..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_L2_cache.yaml +++ /dev/null @@ -1,553 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1700 - title: L2 Cache - data source: - - metric_table: - id: 1701 - title: Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - tips: Tips - metric: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - tips: - Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - tips: - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - tips: - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum - * 128)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - tips: - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - tips: - - - metric_table: - id: 1702 - title: L2 - Fabric Transactions - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read BW: - 
avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) - min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) - max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - tips: - Write and Atomic BW: - avg: - AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - min: - MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - max: - MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - tips: - HBM Write and Atomic Traffic: - avg: AVG((100 * 
(TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - tips: - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != - 0) else None)) - unit: Cycles - tips: - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / 
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != - 0) else None)) - unit: Cycles - tips: - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - tips: - Read Stall: - avg: AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) - + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != - 0) else None)) - min: MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) - + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != - 0) else None)) - max: MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) - + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != - 0) else None)) - unit: pct - tips: - Write Stall: - avg: AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != - 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != - 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != - 0) else None)) - unit: pct - tips: - - - metric_table: - id: 1703 - title: L2 Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / $denom) - min: MIN((TCC_REQ_sum * 128) / $denom) - max: MAX((TCC_REQ_sum * 128) / $denom) - unit: (Bytes + $normUnit) - tips: - Read Bandwidth: - avg: AVG(TCC_READ_SECTORS_sum * 32 / $denom) - min: MIN(TCC_READ_SECTORS_sum * 32 / $denom) - max: MAX(TCC_READ_SECTORS_sum * 32 / $denom) - unit: (Bytes + $normUnit) - tips: - Write 
Bandwidth: - avg: AVG(TCC_WRITE_SECTORS_sum * 32 / $denom) - min: MIN(TCC_WRITE_SECTORS_sum * 32 / $denom) - max: MAX(TCC_WRITE_SECTORS_sum * 32 / $denom) - unit: (Bytes + $normUnit) - tips: - Atomic Bandwidth: - avg: AVG(TCC_ATOMIC_SECTORS_sum * 32 / $denom) - min: MIN(TCC_ATOMIC_SECTORS_sum * 32 / $denom) - max: MAX(TCC_ATOMIC_SECTORS_sum * 32 / $denom) - unit: (Bytes + $normUnit) - tips: - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Bypasss Req: - avg: AVG((TCC_BYPASS_REQ_sum / $denom)) - min: MIN((TCC_BYPASS_REQ_sum / $denom)) - max: MAX((TCC_BYPASS_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - tips: - Input Buffer Req: - avg: AVG((TCC_IB_REQ_sum / $denom)) - min: MIN((TCC_IB_REQ_sum / $denom)) - max: MAX((TCC_IB_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + 
TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - tips: - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - tips: - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - tips: - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - tips: - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - tips: - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / 
$denom)) - unit: (Req + $normUnit) - tips: - - - metric_table: - id: 1704 - title: L2 Cache Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Stalled on Latency FIFO: - avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom) - min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom) - max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom) - unit: (Cycles + $normUnit) - tips: - Stalled on Write Data FIFO: - avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom) - min: MIN(TCC_SRC_FIFO_FULL_sum / $denom) - max: MAX(TCC_SRC_FIFO_FULL_sum / $denom) - unit: (Cycles + $normUnit) - tips: - Input Buffer Stalled on L2: - avg: AVG(TCC_IB_STALL_sum / $denom) - min: MIN(TCC_IB_STALL_sum / $denom) - max: MAX(TCC_IB_STALL_sum / $denom) - unit: (Cycles + $normUnit) - tips: - - - metric_table: - id: 1705 - title: L2 - Fabric Interface Stalls - header: - metric: Metric - type: Type - transaction: Transaction - avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - style: - type: simple_multi_bar - metric: - Read - PCIe Stall: - type: PCIe Stall - transaction: Read - avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - Read - Infinity Fabric™ Stall: - type: Infinity Fabric™ Stall - transaction: Read - avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - Read - HBM Stall: - type: HBM Stall - transaction: Read - avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else 
None)) - min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - Write - PCIe Stall: - type: PCIe Stall - transaction: Write - avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - Write - Infinity Fabric™ Stall: - type: Infinity Fabric™ Stall - transaction: Write - avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - Write - HBM Stall: - type: HBM Stall - transaction: Write - avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - tips: - - - metric_table: - id: 1706 - title: L2 - Fabric Detailed Transaction Breakdown - header: - metric: Metric - 
avg: Avg - min: Min - max: Max - unit: Unit - tips: Tips - metric: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (64B): - avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (128B): - avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (32B): - avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - tips: - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - tips: - HBM Write and Atomic: - avg: 
AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - tips: - Write Bandwidth - PCIe: - avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) - min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) - max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) - unit: (Bytes + $normUnit) - tips: - Write Bandwidth - Infinity Fabric™: - avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) - min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) - max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) - unit: (Bytes + $normUnit) - tips: - Write Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) - min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) - max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) - unit: (Bytes + $normUnit) - tips: - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic - HBM: - avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - unit: (Req + $normUnit) - tips: - Atomic Bandwidth - PCIe: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) - min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) - max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) - unit: (Bytes + $normUnit) - tips: - Atomic Bandwidth - Infinity Fabric™: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) - min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) - max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) - unit: (Bytes + $normUnit) - tips: - Atomic 
Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) - min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) - max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) - unit: (Bytes + $normUnit) - tips: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml new file mode 100644 index 0000000000..85abb7d025 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml @@ -0,0 +1,695 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1700 + title: L2 Cache + metrics_description: + Utilization: The ratio of the number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator over the total L2 cycles. + Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of + the peak theoretical bandwidth achievable on the specific accelerator. The number + of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. + Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric + interface per unit time. + L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity + Fabric interface by write and atomic operations per unit time. + HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. 
This value is calculated as the number of HBM channels + multiplied by the HBM channel width multiplied by the HBM clock frequency. + Read BW: The total number of bytes read by the L2 cache from Infinity Fabric per + normalization unit. + HBM Read Traffic: The percent of read requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Read bandwidth directed to the local HBM. + Remote Read Traffic: The percent of read requests generated by the L2 cache that + are routed to any memory location other than the accelerator's local high-bandwidth + memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the size of the request (meaning that 32B and 64B + requests are both counted as a single request), so this metric only approximates + the percent of the L2-Fabric Read bandwidth directed to a remote location. + Uncached Read Traffic: The percent of read requests generated by the L2 cache + that are reading from an uncached memory allocation. Note, as described in the + request flow section, a single 64B read request is typically counted as two + uncached read requests. So, it is possible for the Uncached Read Traffic to + reach up to 200% of the total number of read requests. This breakdown does not + consider the size of the request (i.e., 32B and 64B requests are both counted + as a single request), so this metric only approximates the percent of the L2-Fabric + read bandwidth directed to an uncached memory location. + Write and Atomic BW: The total number of bytes written by the L2 over Infinity + Fabric by write and atomic operations per normalization unit. 
Note that on current + CDNA accelerators, such as the MI2XX, requests are only considered atomic by + Infinity Fabric if they are targeted at non-write-cacheable memory, for example, + fine-grained memory allocations or uncached memory allocations on the MI2XX. + HBM Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are routed to the accelerator's local high-bandwidth memory + (HBM). This breakdown does not consider the size of the request (meaning that + 32B and 64B requests are both counted as a single request), so this metric only + approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to the local HBM. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Remote Write and Atomic Traffic: The percent of write and atomic requests generated by the + L2 cache that are routed to any memory location other than the accelerator's + local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote + accelerator's HBM. This breakdown does not consider the size of the request + (meaning that 32B and 64B requests are both counted as a single request), so + this metric only approximates the percent of the L2-Fabric Write and Atomic bandwidth directed + to a remote location. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at fine-grained memory allocations or uncached memory allocations. + Atomic Traffic: The percent of write and atomic requests generated by the L2 cache that are + atomic requests to any memory location. This breakdown does not consider the + size of the request (meaning that 32B and 64B requests are both counted as a + single request), so this metric only approximates the percent of the L2-Fabric + Write and Atomic bandwidth used by atomic requests.
Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric if + they are targeted at fine-grained memory allocations or uncached memory allocations. + Uncached Write and Atomic Traffic: The percent of write and atomic requests generated + by the L2 cache that are targeting uncached memory allocations. This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + Read Latency: The time-averaged number of cycles read requests spent in Infinity + Fabric before data was returned to the L2. + Write and Atomic Latency: The time-averaged number of cycles write and atomic requests spent + in Infinity Fabric before a completion acknowledgement was returned to the L2. + Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2. + Bandwidth: The number of bytes looked up in the L2 cache, per normalization unit. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so for + example, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + Req: The total number of incoming requests to the L2 from all clients for all + request types, per normalization unit. + Read Req: The total number of read requests to the L2 from all clients. + Write Req: The total number of write requests to the L2 from all clients. + Atomic Req: The total number of atomic requests (with and without return) to the + L2 from all clients. + Streaming Req: The total number of incoming requests to the L2 that are marked + as streaming.
The exact meaning of this may differ depending on the targeted + accelerator, however on an MI2XX this corresponds to non-temporal loads or stores. + The L2 cache attempts to evict streaming requests before normal requests when + the L2 is at capacity. + Probe Req: The number of coherence probe requests made to the L2 cache from outside + the accelerator. On an MI2XX, probe requests may be generated by, for example, + writes to fine-grained device memory or by writes to coarse-grained device memory. + Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + Hits: The total number of requests to the L2 from all clients that hit in the + cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. + Misses: The total number of requests to the L2 from all clients that miss in the + cache. As noted in the Speed-of-Light section, these do not include hit-on-miss + requests. + Writeback: The total number of L2 cache lines written back to memory for any reason. + Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system + or atomic built-ins), by the command processor's memory acquire/release fences, + or for other internal hardware reasons. + Writeback (Internal): The total number of L2 cache lines written back to memory + for internal hardware reasons, per normalization unit. + Writeback (vL1D Req): The total number of L2 cache lines written back to memory + due to requests initiated by the vL1D cache, per normalization unit. + Evict (Internal): The total number of L2 cache lines evicted from the cache due + to capacity limits, per normalization unit. + Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due + to invalidation requests initiated by the vL1D cache, per normalization unit.
+ NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per normalization unit. + UC Req: The total number of requests to the L2 that go to Uncached (UC) memory + allocations. + CC Req: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. + RW Req: The total number of requests to the L2 that go to Read-Write coherent + memory (RW) allocations. + Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled + on write or atomic requests to any memory location because too many write/atomic + requests were currently in flight, as a percent of the total active L2 cycles. + Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of + data from any memory location, per normalization unit. + Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of + data from any memory location, per normalization unit. + Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached + data from any memory location, per normalization unit. 64B requests for uncached + data are counted as two 32B uncached data requests. + HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or + 64B of data from any source other than the accelerator's local HBM, per normalization + unit. + Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to + write or atomically update 32B of data to any memory location, per normalization + unit. + Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric + to write or atomically update 32B or 64B of uncached data, per normalization + unit. 
+ Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to + write or atomically update 64B of data in any memory location, per normalization + unit. + HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write + or atomically update 32B or 64B of data in the accelerator's local HBM, per + normalization unit. + Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to + write or atomically update 32B or 64B of data in any memory location other than + the accelerator's local HBM, per normalization unit. + Atomic: The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per normalization unit. See Request + flow for more detail. Note that on current CDNA accelerators, such as the MI2XX, + requests are only considered atomic by Infinity Fabric if they are targeted + at non-write-cacheable memory, such as fine-grained memory allocations or uncached + memory allocations on the MI2XX. + Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\ + \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\ + \ over the total active L2 cycles." + Write Stall: The ratio of the total number of cycles the L2-Fabric interface was + stalled on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator or CPU) over the total active L2 cycles. + Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to remote PCIe connected accelerators or CPUs as a percent of + the total active L2 cycles. 
+ Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on read requests to remote Infinity Fabric connected accelerators or + CPUs as a percent of the total active L2 cycles. + Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + read requests to the accelerator's local HBM as a percent of the total active + L2 cycles. + Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to remote PCIe connected accelerators or CPUs as a + percent of the total active L2 cycles. + Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was + stalled on write or atomic requests to remote Infinity Fabric connected accelerators + or CPUs as a percent of the total active L2 cycles. + Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on + write or atomic requests to accelerator's local HBM as a percent of the total + active L2 cycles. + data source: + - metric_table: + id: 1701 + title: L2 Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + - metric_table: + id: 
1702 + title: L2-Fabric interface metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read BW: + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - 
TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) + if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / 
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + Read Stall: + avg: AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + min: MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + max: MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + Write Stall: + avg: AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + - metric_table: + id: 1703 + title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Read Bandwidth: + avg: AVG(TCC_READ_SECTORS_sum * 32/ $denom) + min: MIN(TCC_READ_SECTORS_sum * 32/ $denom) + max: MAX(TCC_READ_SECTORS_sum * 32/ $denom) + unit: 
(Bytes + $normUnit) + Write Bandwidth: + avg: AVG(TCC_WRITE_SECTORS_sum * 32/ $denom) + min: MIN(TCC_WRITE_SECTORS_sum * 32/ $denom) + max: MAX(TCC_WRITE_SECTORS_sum * 32/ $denom) + unit: (Bytes + $normUnit) + Atomic Bandwidth: + avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ $denom) + min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ $denom) + max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Bypasss Req: + avg: AVG((TCC_BYPASS_REQ_sum / $denom)) + min: MIN((TCC_BYPASS_REQ_sum / $denom)) + max: MAX((TCC_BYPASS_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Input Buffer Req: + avg: AVG((TCC_IB_REQ_sum / $denom)) + min: MIN((TCC_IB_REQ_sum / $denom)) + max: MAX((TCC_IB_REQ_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) 
else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1704 + title: L2 Cache Stalls + header: + metric: Metric + avg: Avg + min: Min + 
max: Max + unit: Unit + metric: + Stalled on Latency FIFO: + avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom) + min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom) + max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom) + unit: (Cycles + $normUnit) + Stalled on Write Data FIFO: + avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom) + min: MIN(TCC_SRC_FIFO_FULL_sum / $denom) + max: MAX(TCC_SRC_FIFO_FULL_sum / $denom) + unit: (Cycles + $normUnit) + Input Buffer Stalled on L2: + avg: AVG(TCC_IB_STALL_sum / $denom) + min: MIN(TCC_IB_STALL_sum / $denom) + max: MAX(TCC_IB_STALL_sum / $denom) + unit: (Cycles + $normUnit) + - metric_table: + id: 1705 + title: L2 - Fabric Interface stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + style: + type: simple_multi_bar + metric: + Read - PCIe Stall: + type: PCIe Stall + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + Read - Infinity Fabric Stall: + type: "Infinity Fabric\u2122 Stall" + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + Read - HBM Stall: + type: HBM Stall + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if + 
(TCC_BUSY_sum != 0) else None)) + unit: pct + Write - PCIe Stall: + type: PCIe Stall + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + Write - Infinity Fabric Stall: + type: "Infinity Fabric\u2122 Stall" + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + Write - HBM Stall: + type: HBM Stall + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + - metric_table: + id: 1706 + title: L2 - Fabric interface detailed metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + 
unit: (Req + $normUnit) + Read (64B): + avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + Read (128B): + avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: 
MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write Bandwidth - PCIe: + avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) + unit: (Bytes + $normUnit) + "Write Bandwidth - Infinity Fabric\u2122": + avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) + unit: (Bytes + $normUnit) + Write Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) + unit: (Bytes + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Atomic - HBM: + avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Atomic Bandwidth - PCIe: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) + unit: (Bytes + $normUnit) + "Atomic Bandwidth - Infinity Fabric\u2122": + avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) + unit: (Bytes + $normUnit) + Atomic Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) + unit: (Bytes + $normUnit) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml deleted file mode 100644 index 67087415a8..0000000000 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_L2_cache_per_channel.yaml +++ /dev/null @@ -1,298 +0,0 @@ ---- -# Add description/tips for each metric in this section. -# So it could be shown in hover. -Metric Description: - -# Define the panel properties and properties of each metric in the panel. -Panel Config: - id: 1800 - title: L2 Cache (per Channel) - data source: - - metric_table: - id: 1801 - title: Aggregate Stats (All channels) - header: - metric: Metric - avg: Avg - std dev: Std Dev - min: Min - max: Max - unit: Unit - tips: Tips - metric: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + 
(TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + 
(100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) - + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 - * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) - / (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + 
TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) != 0) else None) - unit: pct - tips: - # FIXME: other arggr metrics!! - - - metric_table: - id: 1802 - title: L2 Cache Hit Rate (pct) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1803 - title: L2 Requests (per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1804 - title: L2 Requests (per normUnit) - header: - metric: Channel - read req: L2 Read - write req: L2 Write - atomic req: L2 Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1805 - title: L2-Fabric Requests (per normUnit) - header: - metric: Channel - read req: L2-Fabric Read - write req: L2-Fabric Write and Atomic - atomic req: L2-Fabric Atomic - metric: - "::_1": - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - 
atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1806 - # title: L2-Fabric Latency (Cycles) - # header: - # metric: Metric - # read lat: L2-Fabric Read - # write lat: L2-Fabric Write - # atomic lat: L2-Fabric Atomic - # metric: - # "::_1": - # read lat: - # AVG(((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - # != 0) else None)) - # write lat: - # AVG(((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - # != 0) else None)) - # atomic lat: - # AVG(((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - # (TCC_EA0_ATOMIC[::_1] != 0) else 0)) - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_multiple_bar - - - metric_table: - id: 1806 - title: L2-Fabric Read Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1807 - title: L2-Fabric Write and Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: - ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1808 - title: L2-Fabric Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if - (TCC_EA0_ATOMIC[::_1] != 0) else 0) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_box - - - metric_table: - id: 1809 - title: L2-Fabric Read Stall (Cycles per normUnit) - header: - metric: Channel - ea read stall - pcie: L2-Fabric Read Stall (PCIe) - ea read stall - if: L2-Fabric Read Stall (Infinity Fabric™) - ea read stall - 
hbm: L2-Fabric Read Stall (HBM) - metric: - "::_1": - ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) - ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) - ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - - metric_table: - id: 1810 - title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) - header: - metric: Channel - ea write stall - pcie: L2-Fabric Write Stall (PCIe) - ea write stall - if: L2-Fabric Write Stall (Infinity Fabric™) - ea write stall - hbm: L2-Fabric Write Stall (HBM) - ea write stall - starve: L2-Fabric Write Starve - metric: - "::_1": - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) - ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) - ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) / $denom)) - placeholder_range: - "::_1": $total_l2_chan - cli_style: simple_multiple_bar - - # - metric_table: - # id: 1811 - # title: L2 Tag Stall (cycles) - # header: - # metric: Metric - # expr: Expression - # metric: - # "::_1": - # expr: TCC_TAG_STALL[::_1] - # placeholder_range: - # "::_1": $total_l2_chan - # cli_style: simple_box - - - metric_table: - id: 1812 - title: L2-Fabric (128B read requests per normUnit) - header: - metric: Channel - expr: Expression - metric: - "::_1": - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - "::_1": $total_l2_chan - # tips: Number of 128-byte read requests sent to EA - cli_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml new file mode 100644 index 
0000000000..09a1298380 --- /dev/null +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml @@ -0,0 +1,257 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py +Panel Config: + id: 1800 + title: L2 Cache (per Channel) + metrics_description: + L2 Cache Hit Rate: The percent of total number of requests to the L2 from all + clients that hit in the cache. As noted in the Speed-of-Light section, this + includes hit-on-miss requests. + data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + metric: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + 
TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * 
TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 + * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * + TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + 
TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) != 0) else None) + unit: pct + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: 
AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: L2-Fabric Read Stall (PCIe) + ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + ::_1: + ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / + $denom)) + ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / + $denom)) + ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: 
Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + ::_1: + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) + / $denom)) + ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / + $denom)) + ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) + / $denom)) + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1812 + title: L2-Fabric (128B read requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml index d6c4ff393d..e94471d7dc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml @@ -1,10 +1,11 @@ ---- +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py Panel Config: id: 2100 title: PC Sampling + metrics_description: {} data source: - - pc_sampling_table: - id: 2101 - title: PC Sampling - source: ps_file - comparable: false # enable it later + - pc_sampling_table: + id: 2101 + title: PC Sampling + source: ps_file + comparable: false diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/panel_config_template.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/panel_config_template.yaml index 2221b06ee2..44ef9679d2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/panel_config_template.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/panel_config_template.yaml @@ -14,7 +14,7 @@ # Metric Description: - METRIC01: &METRIC01_anchor Scalar Arithmetic Logic Unit + METRIC01: Scalar Arithmetic Logic Unit # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -31,20 +31,17 @@ Panel Config: unit: Unit peak: Peak pop: Pct of Peak - tips: Tips metric: METRIC01: value: AVG(100 * SQ_ACTIVE_INST_SCA / ( $GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu )) unit: pct peak: 100 pop: AVG(100* SQ_ACTIVE_INST_SCA/($GRBM_GUI_ACTIVE_PER_XCD*$cu_per_gpu)) - tips: *METRIC01_anchor METRIC02: value: AVG(100 * SQ_ACTIVE_INST_VALU / ( $GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: pct peak: 100 pop: AVG(100* SQ_ACTIVE_INST_VALU/($GRBM_GUI_ACTIVE_PER_XCD*$cu_per_gpu)) - tips: # CSV table sample - raw_csv_table: id: 802 diff --git a/projects/rocprofiler-compute/src/utils/gui.py b/projects/rocprofiler-compute/src/utils/gui.py index 6972ae89bb..556eb0a466 100644 --- a/projects/rocprofiler-compute/src/utils/gui.py +++ b/projects/rocprofiler-compute/src/utils/gui.py @@ -49,12 +49,16 @@ def filter_df(column, df, filt): def multi_bar_chart(table_id, display_df): if table_id == 1604: - nested_bar = {"NC": {}, "UC": {}, "RW": {}, "CC": {}} + nested_bar = {} for index, row in 
display_df.iterrows(): + if not row["Coherency"] in nested_bar: + nested_bar[row["Coherency"]] = {} nested_bar[row["Coherency"]][row["Xfer"]] = row["Avg"] if table_id == 1705: # L2 - Fabric Interface Stalls - nested_bar = {"Read": {}, "Write": {}} + nested_bar = {} for index, row in display_df.iterrows(): + if not row["Transaction"] in nested_bar: + nested_bar[row["Transaction"]] = {} nested_bar[row["Transaction"]][row["Type"]] = row["Avg"] return nested_bar @@ -307,14 +311,14 @@ def build_table_chart( else: formatted_columns.append(dict(id=col, name=col, type="text")) - # tooltip shows only on the 1st col for now if 'Tips' available + # tooltip shows only on the 1st col for now if 'Metric Description' available table_tooltip = ( [ { column: { "value": ( - str(row["Tips"]) - if column == display_columns[0] and row["Tips"] + str(row["Description"]) + if column == display_columns[0] and row["Description"] else "" ), "type": "markdown", @@ -323,7 +327,7 @@ def build_table_chart( } for row in original_df.to_dict("records") ] - if "Tips" in original_df.columns.values.tolist() + if "Description" in original_df.columns.values.tolist() else None ) diff --git a/projects/rocprofiler-compute/src/utils/gui_components/memchart.py b/projects/rocprofiler-compute/src/utils/gui_components/memchart.py index 7516b49d78..5ab2027972 100644 --- a/projects/rocprofiler-compute/src/utils/gui_components/memchart.py +++ b/projects/rocprofiler-compute/src/utils/gui_components/memchart.py @@ -25,7 +25,6 @@ from dash import html from dash_svg import G, Path, Rect, Svg, Text -from config import HIDDEN_COLUMNS from utils.logger import console_error @@ -312,7 +311,7 @@ def insert_chart_data(mem_data, base_data): id="sl1_rd", fill="#FFFFFF", fontSize="12px", - children=format_value_for_display(memchart_values["VL1D Rd"]), + children=memchart_values["sL1D Rd"], ), Text( x="838", @@ -320,7 +319,7 @@ def insert_chart_data(mem_data, base_data): id="sl1_hit", fill="rgb(0, 0, 0)", fontSize="12px", - 
children=memchart_values["VL1D Hit"], + children=memchart_values["sL1D Hit"], ), Text( x="838", @@ -328,7 +327,7 @@ def insert_chart_data(mem_data, base_data): id="sl1_lat", fill="rgb(0, 0, 0)", fontSize="12px", - children=memchart_values["VL1D Lat"], + children=memchart_values["sL1D Lat"], ), Text( x="1000", @@ -336,7 +335,7 @@ def insert_chart_data(mem_data, base_data): id="sl1_l2_rd", fill="#FFFFFF", fontSize="12px", - children=format_value_for_display(memchart_values["VL1D_L2 Rd"]), + children=memchart_values["sL1D_L2 Rd"], ), Text( x="1000", @@ -344,7 +343,7 @@ def insert_chart_data(mem_data, base_data): id="sl1_l2_wr", fill="#FFFFFF", fontSize="12px", - children=format_value_for_display(memchart_values["VL1D_L2 Wr"]), + children=memchart_values["sL1D_L2 Wr"], ), Text( x="1008", @@ -352,7 +351,7 @@ def insert_chart_data(mem_data, base_data): id="sl1_l2_atom", fill="#FFFFFF", fontSize="12px", - children=memchart_values["VL1D_L2 Atomic"], + children=memchart_values["sL1D_L2 Atomic"], ), # ---------------------------------------- # Instr L1 Cache Block diff --git a/projects/rocprofiler-compute/src/utils/mem_chart.py b/projects/rocprofiler-compute/src/utils/mem_chart.py index f6e078835d..6d3211d0b5 100644 --- a/projects/rocprofiler-compute/src/utils/mem_chart.py +++ b/projects/rocprofiler-compute/src/utils/mem_chart.py @@ -1079,7 +1079,7 @@ class MemChart: wires_E_GLV.vl1_rd = metric_dict["VL1 Rd"] wires_E_GLV.vl1_wr = metric_dict["VL1 Wr"] wires_E_GLV.vl1_atomic = metric_dict["VL1 Atomic"] - wires_E_GLV.sl1_rd = metric_dict["VL1D Rd"] + wires_E_GLV.sl1_rd = metric_dict["sL1D Rd"] wires_E_GLV.draw(canvas) @@ -1146,8 +1146,8 @@ class MemChart: block_const_L1.y_max = block_vector_L1.y_min - 3 block_const_L1.y_min = block_const_L1.y_max - 5 - block_const_L1.hit = metric_dict["VL1D Hit"] - block_const_L1.latency = metric_dict["VL1D Lat"] + block_const_L1.hit = metric_dict["sL1D Hit"] + block_const_L1.latency = metric_dict["sL1D Lat"] block_const_L1.draw(canvas) @@ 
-1174,9 +1174,9 @@ class MemChart: wires_L1_L2.vl1_l2_rd = metric_dict["VL1_L2 Rd"] wires_L1_L2.vl1_l2_wr = metric_dict["VL1_L2 Wr"] wires_L1_L2.vl1_l2_atomic = metric_dict["VL1_L2 Atomic"] - wires_L1_L2.sl1_l2_rd = metric_dict["VL1D_L2 Rd"] - wires_L1_L2.sl1_l2_wr = metric_dict["VL1D_L2 Wr"] - wires_L1_L2.sl1_l2_atomic = metric_dict["VL1D_L2 Atomic"] + wires_L1_L2.sl1_l2_rd = metric_dict["sL1D_L2 Rd"] + wires_L1_L2.sl1_l2_wr = metric_dict["sL1D_L2 Wr"] + wires_L1_L2.sl1_l2_atomic = metric_dict["sL1D_L2 Atomic"] wires_L1_L2.il1_l2_req = metric_dict["IL1_L2 Rd"] wires_L1_L2.draw(canvas) @@ -1331,9 +1331,9 @@ if __name__ == "__main__": metric_dict["VL1 Coalesce"] = 27 metric_dict["VL1 Stall"] = 28 - metric_dict["VL1D Rd"] = 29 - metric_dict["VL1D Hit"] = 30 - metric_dict["VL1D Lat"] = 31 + metric_dict["sL1D Rd"] = 29 + metric_dict["sL1D Hit"] = 30 + metric_dict["sL1D Lat"] = 31 metric_dict["IL1 Fetch"] = 32 metric_dict["IL1 Hit"] = 33 @@ -1344,9 +1344,9 @@ if __name__ == "__main__": metric_dict["VL1_L2 Wr"] = 37 metric_dict["VL1_L2 Atomic"] = 38 - metric_dict["VL1D_L2 Rd"] = 39 - metric_dict["VL1D_L2 Wr"] = 40 - metric_dict["VL1D_L2 Atomic"] = 41 + metric_dict["sL1D_L2 Rd"] = 39 + metric_dict["sL1D_L2 Wr"] = 40 + metric_dict["sL1D_L2 Atomic"] = 41 metric_dict["IL1_L2 Rd"] = 42 metric_dict["L2 Hit"] = 43 diff --git a/projects/rocprofiler-compute/src/utils/mi_gpu_spec.py b/projects/rocprofiler-compute/src/utils/mi_gpu_spec.py index 85d6421c28..e2b52b1f6c 100644 --- a/projects/rocprofiler-compute/src/utils/mi_gpu_spec.py +++ b/projects/rocprofiler-compute/src/utils/mi_gpu_spec.py @@ -228,7 +228,7 @@ class MIGPUSpecs: gpu_arch_lower = gpu_arch_.lower() # Handle gfx942 with chip_id mapping - if gpu_arch_lower not in ("gfx906", "gfx908", "gfx90a"): + if gpu_arch_lower not in ("gfx908", "gfx90a"): if chip_id_ and int(chip_id_) in cls._chip_id_dict: gpu_model = cls._chip_id_dict.get(int(chip_id_)) else: @@ -283,7 +283,7 @@ class MIGPUSpecs: 4. 
Default settings (last resort) """ # Constants for legacy GPUs that don't support compute partitions - LEGACY_ARCHS = {"gfx906", "gfx908", "gfx90a"} + LEGACY_ARCHS = {"gfx908", "gfx90a"} LEGACY_MODELS = {"mi50", "mi60", "mi100", "mi210", "mi250", "mi250x"} # Normalize inputs to lowercase for consistent comparison diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py index 164558e75c..a12040d179 100644 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -509,17 +509,19 @@ def build_dfs(archConfigs, filter_metrics, sys_info): headers.append(k) for key, tile in data_config["header"].items(): - if key != "metric" and key != "tips" and key != "expr": + if key != "metric" and key != "expr": headers.append(tile) else: + headers.append(data_config["header"]["metric"]) for key, tile in data_config["header"].items(): - if key != "tips": + if key != "metric": headers.append(tile) - # do we always need one? 
headers.append("coll_level") - if "tips" in data_config["header"].keys(): - headers.append(data_config["header"]["tips"]) + + # Only add Metrics Description column if it is defined in the panel + if "metrics_description" in panel: + headers.append("Description") df = pd.DataFrame(columns=headers) @@ -563,16 +565,12 @@ def build_dfs(archConfigs, filter_metrics, sys_info): for bk, bv in simple_box.items(): values.append(bv[0] + v + bv[1]) else: - if ( - k != "tips" - and k != "coll_level" - and k != "alias" - ): + if k != "coll_level" and k != "alias": values.append(v) else: for k, v in entries.items(): - if k != "tips" and k != "coll_level" and k != "alias": + if k != "coll_level" and k != "alias": values.append(v) eqn_content.append(v) @@ -584,8 +582,11 @@ def build_dfs(archConfigs, filter_metrics, sys_info): else: values.append(schema.pmc_perf_file_prefix) - if "tips" in entries.keys(): - values.append(entries["tips"]) + if "metrics_description" in panel: + if key in panel["metrics_description"]: + values.append(panel["metrics_description"][key]) + else: + values.append("") # print(headers, values) # print(key, entries) @@ -1459,7 +1460,14 @@ def build_comparable_columns(time_unit): Build comparable columns/headers for display """ comparable_columns = schema.supported_field - top_stat_base = ["Count", "Sum", "Mean", "Median", "Standard Deviation"] + top_stat_base = [ + "Count", + "Sum", + "Mean", + "Median", + "Standard Deviation", + "Description", + ] for h in top_stat_base: comparable_columns.append(h + "(" + time_unit + ")") diff --git a/projects/rocprofiler-compute/src/utils/tty.py b/projects/rocprofiler-compute/src/utils/tty.py index 0c75f3c54b..90ea11d1bd 100644 --- a/projects/rocprofiler-compute/src/utils/tty.py +++ b/projects/rocprofiler-compute/src/utils/tty.py @@ -23,6 +23,7 @@ ##############################################################################el import copy +import textwrap from pathlib import Path import pandas as pd @@ -51,8 +52,21 @@ def 
string_multiple_lines(source, width, max_rows): def get_table_string(df, transpose=False, decimal=2): + """ + Convert DataFrame to a formatted table string, wrapping specified columns. + """ + df_to_show = df.transpose() if transpose else df + wrap_columns = ["Description"] + wrap_width = 40 + for col in wrap_columns: + if col in df_to_show.columns: + df_to_show[col] = ( + df_to_show[col] + .astype(str) + .apply(lambda x: textwrap.fill(x, width=wrap_width)) + ) return tabulate( - df.transpose() if transpose else df, + df_to_show, headers="keys", tablefmt="fancy_grid", floatfmt="." + str(decimal) + "f", @@ -118,6 +132,10 @@ def show_all(args, runs, archConfigs, output, profiling_config, roof_plot=None): int(convert_metric_id_to_panel_info(metric_id)[0]) for metric_id in filter_panel_ids ] + if args.include_cols: + hidden_cols = list(set(config.HIDDEN_COLUMNS_CLI) - set(args.include_cols)) + else: + hidden_cols = config.HIDDEN_COLUMNS_CLI for panel_id, panel in archConfigs.panel_configs.items(): # Skip panels that don't support baseline comparison @@ -196,12 +214,14 @@ def show_all(args, runs, archConfigs, output, profiling_config, roof_plot=None): df = pd.DataFrame(index=base_df.index) for header in list(base_df.keys()): + # For raw csv table, columns cannot be filtered + # If columns are filtered, then skip the headers not in filtered columns if ( - (not args.cols) - or (args.cols and base_df.columns.get_loc(header) in args.cols) - or (type == "raw_csv_table") + type == "raw_csv_table" + or not args.cols + or base_df.columns.get_loc(header) in args.cols ): - if header in config.HIDDEN_COLUMNS: + if header in hidden_cols: pass elif header not in comparable_columns: if ( @@ -236,8 +256,7 @@ def show_all(args, runs, archConfigs, output, profiling_config, roof_plot=None): cur_df = convert_time_columns(cur_df, args.time_unit) if (type == "raw_csv_table") or ( - type == "metric_table" - and (not header in config.HIDDEN_COLUMNS) + type == "metric_table" and (not header in 
hidden_cols) ): if run != base_run: # calc percentage over the baseline diff --git a/projects/rocprofiler-compute/tests/test_TCP_counters.py b/projects/rocprofiler-compute/tests/test_TCP_counters.py index 5728257cfa..a591539788 100644 --- a/projects/rocprofiler-compute/tests/test_TCP_counters.py +++ b/projects/rocprofiler-compute/tests/test_TCP_counters.py @@ -174,7 +174,7 @@ def test_L1_cache_counters( # 3. save results in local # FIXME: customize file name to avoid hardcode - csv_path = workload_dir + "/16.3_L1D_Cache_Accesses.csv" + csv_path = workload_dir + "/16.3_vL1D_cache_access_metrics.csv" data = load_metrics(csv_path) for metric in metrics: diff --git a/projects/rocprofiler-compute/tests/test_analyze_commands.py b/projects/rocprofiler-compute/tests/test_analyze_commands.py index 4b4c2ce74f..b3a519a125 100644 --- a/projects/rocprofiler-compute/tests/test_analyze_commands.py +++ b/projects/rocprofiler-compute/tests/test_analyze_commands.py @@ -470,37 +470,13 @@ def test_save_dfs(binary_handler_analyze_rocprof_compute): assert code == 0 files_in_workload = os.listdir(output_path) - single_row_tables = [ - "0.1_Top_Kernels.csv", - "13.3_Instruction_Cache_-_L2_Interface.csv", - "18.1_Aggregate_Stats_(All_channels).csv", - ] for file_name in files_in_workload: df = pd.read_csv(output_path + "/" + file_name) - if file_name in single_row_tables: - assert len(df.index) == 1 - else: - assert len(df.index) >= 3 + assert len(df.index) >= 1 shutil.rmtree(output_path) test_utils.clean_output_dir(config["cleanup"], workload_dir) - for dir in indirs: - workload_dir = test_utils.setup_workload_dir(dir) - code = binary_handler_analyze_rocprof_compute( - ["analyze", "--path", workload_dir, "--save-dfs", output_path] - ) - assert code == 0 - - files_in_workload = os.listdir(output_path) - for file_name in files_in_workload: - df = pd.read_csv(output_path + "/" + file_name) - if file_name in single_row_tables: - assert len(df.index) == 1 - else: - assert len(df.index) >= 3 - 
test_utils.clean_output_dir(config["cleanup"], workload_dir) - @pytest.mark.col def test_col_1(binary_handler_analyze_rocprof_compute): @@ -519,7 +495,7 @@ def test_col_2(binary_handler_analyze_rocprof_compute): for dir in indirs: workload_dir = test_utils.setup_workload_dir(dir) code = binary_handler_analyze_rocprof_compute( - ["analyze", "--path", workload_dir, "--cols", "2"] + ["analyze", "--path", workload_dir, "--cols", "2", "--include-cols", "Description"] ) assert code == 0 diff --git a/projects/rocprofiler-compute/tests/test_autogen_config.py b/projects/rocprofiler-compute/tests/test_autogen_config.py new file mode 100644 index 0000000000..7b09ede7e9 --- /dev/null +++ b/projects/rocprofiler-compute/tests/test_autogen_config.py @@ -0,0 +1,43 @@ +import hashlib +from pathlib import Path + +import pytest +import yaml + + +@pytest.mark.autogen_config +def test_modification_time(): + # Ensure modification time of utils/unified_config.yaml is older than + # utils/autogen_hash.yaml + # docs/data/metrics_description.yaml and + # src/rocprof_compute_soc/analysis_configs/gfx*/*.yaml + + unified_config_path = Path("utils/unified_config.yaml") + hash_path = Path("utils/autogen_hash.yaml") + docs_config_path = Path("docs/data/metrics_description.yaml") + analysis_config_paths = list( + Path("src/rocprof_compute_soc/analysis_configs").glob("gfx*/*.yaml") + ) + + assert ( + unified_config_path.stat().st_mtime < hash_path.stat().st_mtime + ), f"{unified_config_path} is not older than {hash_path}" + + assert ( + unified_config_path.stat().st_mtime < docs_config_path.stat().st_mtime + ), f"{unified_config_path} is not older than {docs_config_path}" + + for analysis_config_path in analysis_config_paths: + assert ( + unified_config_path.stat().st_mtime < analysis_config_path.stat().st_mtime + ), f"{unified_config_path} is not older than {analysis_config_path}" + + # Ensure hash map consistency + + with open(hash_path, "r") as f: + hash_map = yaml.safe_load(f) + for file, hash 
in hash_map.items(): + file_hash = hashlib.sha256(Path(file).read_bytes()).hexdigest() + assert ( + file_hash == hash + ), f"Hash mismatch for {file}: expected {hash}, got {file_hash}" diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 294c5a1a9e..01bf4ee9e4 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -41,7 +41,6 @@ import test_utils # TODO: MI350 What are the gpu models in MI 350 series SUPPORTED_ARCHS = { - "gfx906": {"mi50": ["MI50", "MI60"]}, "gfx908": {"mi100": ["MI100"]}, "gfx90a": {"mi200": ["MI210", "MI250", "MI250X"]}, "gfx940": {"mi300": ["MI300A_A0"]}, diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py index 386a0acd0d..55afb14cfc 100644 --- a/projects/rocprofiler-compute/tests/test_utils.py +++ b/projects/rocprofiler-compute/tests/test_utils.py @@ -8702,7 +8702,7 @@ def test_add_counter_overwrite_existing(): updated_description = "Updated version" updated_expression = "updated_expr" - updated_architectures = ["gfx906", "gfx908"] + updated_architectures = ["gfx908"] updated_properties = ["P_UPDATED", "P_NEW"] diff --git a/projects/rocprofiler-compute/utils/autogen_hash.yaml b/projects/rocprofiler-compute/utils/autogen_hash.yaml new file mode 100644 index 0000000000..ec28448cca --- /dev/null +++ b/projects/rocprofiler-compute/utils/autogen_hash.yaml @@ -0,0 +1,110 @@ +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. 
Generated by utils/split_config.py +src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b +src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b +src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b +src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b +src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b +src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b +src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef +src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef +src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef +src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef +src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef +src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml: 739e39e69056984c277a69c17a6866effa860f56e8b1d3ea5d625582f16228ef +src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: 383f51bf243980df626dacd34c26844b397e4093988524f91e3c7a9a3b8bf063 +src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml: e9f552ee72849dc9c4ab14fee77ecc2681f4bcf610a8649c55365ab7eea7aafc 
+src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57 +src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml: a1d4f1f712755f6369d3a350eadcd5b0fcd90b5c0cab8be691c24bb860d90ba5 +src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57 +src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml: a2cb003c74c0a75b9fe690da4e21b46e78fdb2f3233fc4753bca9276e93d60b0 +src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: 6e008d397d9f364d6cb5fdd5a7974e4d372654a583d3e30d8bb8796f97b9b211 +src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml: cbb3c841b1ad8cbb23a071fcc145dedabb5341d36054c188c9f61878632fd664 +src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: f3c235b5c9ef06c837c04689fc1f413d1137360795ffccfc0256b40769c926c6 +src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: f3c235b5c9ef06c837c04689fc1f413d1137360795ffccfc0256b40769c926c6 +src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml: f3c235b5c9ef06c837c04689fc1f413d1137360795ffccfc0256b40769c926c6 +src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml: deeb8671c4628aaad3046391975c9c62547460dcf18eb179ef7dc6b2729919e2 +src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: 57f95dcd487dfcdf24e1c2d8eb16d14dc3462df83d11a08e7de2b06343b48c3e +src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 57f95dcd487dfcdf24e1c2d8eb16d14dc3462df83d11a08e7de2b06343b48c3e +src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: 57f95dcd487dfcdf24e1c2d8eb16d14dc3462df83d11a08e7de2b06343b48c3e +src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 57f95dcd487dfcdf24e1c2d8eb16d14dc3462df83d11a08e7de2b06343b48c3e 
+src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: 57f95dcd487dfcdf24e1c2d8eb16d14dc3462df83d11a08e7de2b06343b48c3e +src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: 57f95dcd487dfcdf24e1c2d8eb16d14dc3462df83d11a08e7de2b06343b48c3e +src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb +src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb +src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb +src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb +src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb +src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml: 6eb8acab3abb4183868470a4bd8ee97bf8a426f5faeca46aab0d9000c1700f76 +src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac +src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac +src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac +src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac +src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac +src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml: 
7fe5d39165fd1100de7f89639cf6b8b1ffdcba46f86063d2040bee3bc14dc032 +src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848 +src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848 +src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848 +src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848 +src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848 +src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml: 9ce451e4e9099bb5e43e6e41e5621b469d849f1e4900a74f156337eed95b644d +src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml: 4f70eb28dad079098fcc97813c59b02dc1bda06ceb5f7806a94b3b26184e47af +src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml: 84eee8712ebd101e593598098bd6f9e281b36f116d0f3eba6a415c418dbbb647 +src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml: 7d9ae6b30157645b0461abaf84aa9c793c87ed630a8a6611a34ae043cbcc4c5d +src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml: 7d9ae6b30157645b0461abaf84aa9c793c87ed630a8a6611a34ae043cbcc4c5d +src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml: 7d9ae6b30157645b0461abaf84aa9c793c87ed630a8a6611a34ae043cbcc4c5d +src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml: a34553f977577980312b27005bfcd9c1e4c79f77c0c3dc4e023a17bf86169373 +src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml: 472d6f872fb9f545940899824f87f88d4f7f7544ae11addd10da08ced0110f49 
+src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml: 2eeac2474dce7ff3b03650575dd7ce92458db8f70a7958536ada892119d33c69 +src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml: 4a25b6abf24f4a622fde1a3cfe65fe7236cf1e626fc2444667883997564cea1e +src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml: 4a25b6abf24f4a622fde1a3cfe65fe7236cf1e626fc2444667883997564cea1e +src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml: 4a25b6abf24f4a622fde1a3cfe65fe7236cf1e626fc2444667883997564cea1e +src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml: 4ef656938f8a9667ae872db522855856469accff9cb42bc0444b469346760dfd +src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml: 80f3ca3ea15de009c5278ea20566d8c08d62e0087971e5f9aeae1c89df1dd898 +src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml: 80f3ca3ea15de009c5278ea20566d8c08d62e0087971e5f9aeae1c89df1dd898 +src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml: 3bbf3928288990863cfe72fd00a28785fde0a36f103f5381df578aae2eb28be0 +src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml: 3bbf3928288990863cfe72fd00a28785fde0a36f103f5381df578aae2eb28be0 +src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml: 3bbf3928288990863cfe72fd00a28785fde0a36f103f5381df578aae2eb28be0 +src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml: 505163510a3b0132ee487f9e024188de2deb97d0f72e3d729b95f86e7c3434b3 +src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml: 2437e2f8191675c4116d0da1db291f3ad2715281ea812e9fdd6506cf213e5d1b +src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml: 2437e2f8191675c4116d0da1db291f3ad2715281ea812e9fdd6506cf213e5d1b 
+src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml: 2437e2f8191675c4116d0da1db291f3ad2715281ea812e9fdd6506cf213e5d1b +src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml: 2437e2f8191675c4116d0da1db291f3ad2715281ea812e9fdd6506cf213e5d1b +src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml: 2437e2f8191675c4116d0da1db291f3ad2715281ea812e9fdd6506cf213e5d1b +src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml: 2437e2f8191675c4116d0da1db291f3ad2715281ea812e9fdd6506cf213e5d1b +src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml: 8871e3b65132321cb3880a48f894d8c3b2c56a3936d382c3c2b02723ed5c8ec5 +src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml: 8871e3b65132321cb3880a48f894d8c3b2c56a3936d382c3c2b02723ed5c8ec5 +src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml: 8871e3b65132321cb3880a48f894d8c3b2c56a3936d382c3c2b02723ed5c8ec5 +src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml: 8871e3b65132321cb3880a48f894d8c3b2c56a3936d382c3c2b02723ed5c8ec5 +src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml: 8871e3b65132321cb3880a48f894d8c3b2c56a3936d382c3c2b02723ed5c8ec5 +src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml: 8871e3b65132321cb3880a48f894d8c3b2c56a3936d382c3c2b02723ed5c8ec5 +src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 231f9b7c09266c4aac50ac4db1b055c36eb6e563ba713c5f3aa30508d03b9170 +src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml: eb1ec287cc1f9f133b80fdde072a2b86e819f96ccdf4c305e721f3466d37b156 +src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 52ae21cec4ce4990e966d7fb438ac02b7e63ad4bc428f9770cd2c08d80f712da 
+src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 52ae21cec4ce4990e966d7fb438ac02b7e63ad4bc428f9770cd2c08d80f712da +src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 52ae21cec4ce4990e966d7fb438ac02b7e63ad4bc428f9770cd2c08d80f712da +src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml: f7b032202e1aea6befda0d62e3d9f04b846f473218bd62e90d59a34678b62a77 +src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml: e6ec43014ce7b7cc072385d4eba072dd187b5de14979c169a3c1e9b8fc4c2762 +src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml: e6ec43014ce7b7cc072385d4eba072dd187b5de14979c169a3c1e9b8fc4c2762 +src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml: 0e53921cc8d87a9adade250b9632fa42d33c825565152e37d6e56f45f83a3a28 +src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml: 0e53921cc8d87a9adade250b9632fa42d33c825565152e37d6e56f45f83a3a28 +src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml: 0e53921cc8d87a9adade250b9632fa42d33c825565152e37d6e56f45f83a3a28 +src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml: cd21327c193d2af8c18066b9c13f67e3d5dfb44731777bc5a1b6a7738c902dd1 +src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: 6aeda249093c666000b104f8631b4a85698e083dd55e77e1e1f095f222054742 +src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: a4ec667e0b827c046de207416d185dd528f030f29bdee162a2634e579bb31846 +src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml: a9ac811e491fce354aef029b11a96edb589535e84224fa2e2b323623e9fd6e00 +src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml: 7d925c3369b366c23e638ca2b3d074672324a5b9fd0fa586a3e71dee458743a6 
+src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml: 7532dc55c28c809f435f5edae98632a2d99adc898b2b71a661e2c9696f674f4a +src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml: a9f3146a99e74eaba5327be3cdf9361fb8b69d1640751fb05519e44dd2ec7292 +src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: a0c53202fe9f68d5e1fa689ce0643c471ced7d47e007d8ccc68fba294f7f6a05 +src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml: a0c53202fe9f68d5e1fa689ce0643c471ced7d47e007d8ccc68fba294f7f6a05 +src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f +src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f +src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f +src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml: 896d9af08778c5ecddc6d6961ae96b972a739c913ed9143e3f5fb2f7e878cb5e +src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7 +src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7 +src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7 +src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7 +src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7 +src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7 
# NOTES
#
# Read utils/unified_config.yaml and split it into per gfx architecture per panel config files
# WARNING: This script will overwrite existing files under per gfx architecture folders under src/rocprof_compute_soc/analysis_configs
#
# Read utils/unified_config.yaml and split it into metric tables per documentation section
# WARNING: This script will overwrite existing docs/data/metrics_description.yaml

import copy
import hashlib
import re
from pathlib import Path

import yaml

# Get root directory of the project
ROOT_DIR = Path(__file__).parent.parent
SOURCE_DIR = ROOT_DIR.joinpath("utils")
TARGET_DIR = ROOT_DIR.joinpath("src", "rocprof_compute_soc", "analysis_configs")
DOC_TARGET_DIR = ROOT_DIR.joinpath("docs", "data")
# Banner written as the first line of every generated file.
AUTOGEN_TEXT = (
    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
    "Generated from utils/unified_config.yaml. "
    "Generated by utils/split_config.py\n"
)
HASH_FILE = ROOT_DIR.joinpath("utils", "autogen_hash.yaml")
# Maps each generated file (path relative to ROOT_DIR) to the SHA-256 digest of
# its contents. Filled in by update_analysis_config() and update_documentation(),
# and written to HASH_FILE by update_hash().
HASH_FILE_MAP = {}
GFX_VERSIONS = ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"]


def _load_unified_config():
    """Return the parsed contents of utils/unified_config.yaml."""
    # Explicit encoding so parsing does not depend on the platform default.
    with open(SOURCE_DIR.joinpath("unified_config.yaml"), encoding="utf-8") as file:
        return yaml.safe_load(file)


def _write_autogen_yaml(filename, data):
    """Overwrite *filename* with the autogen banner followed by *data* as YAML."""
    # Fixed encoding and newline keep the written bytes (and therefore the
    # recorded SHA-256 digests) identical on every platform.
    with open(filename, "w", encoding="utf-8", newline="\n") as file:
        file.write(AUTOGEN_TEXT)
        yaml.dump(data, file, sort_keys=False)
    print(f"File write: {filename}")


def _record_hash(filename):
    """Store the SHA-256 digest of *filename* in HASH_FILE_MAP."""
    # as_posix() keeps the key independent of the OS path separator.
    relative_name = filename.relative_to(ROOT_DIR).as_posix()
    HASH_FILE_MAP[relative_name] = hashlib.sha256(filename.read_bytes()).hexdigest()


def _panel_file_stem(panel_id, title):
    """Return the '<4-digit id>_<normalized title>' stem used for a panel file."""
    # Replace parenthesis, hyphen, slash and space with underscore
    slug = re.sub(r"[()\-/ ]+", "_", title)
    # Remove duplicate (and leading/trailing) underscores
    slug = "_".join(filter(None, slug.split("_")))
    # Convert int into str with 4 digits, title to lower case
    return f"{str(panel_id).zfill(4)}_{slug.lower()}"


def _documentation_metrics(panel_config):
    """Return {metric: {"rst": ..., "unit": ...}} for one panel of the unified config.

    A fresh dictionary is built on every call on purpose: if several metric
    tables shared one object, yaml.dump would emit anchors/aliases and change
    the generated documentation file.
    """
    # .get() mirrors update_analysis_config(): panels without descriptions
    # yield an empty mapping instead of raising KeyError.
    descriptions = panel_config.get("metrics_description", {})
    return {
        key: {"rst": value["rst"], "unit": value["unit"]}
        for key, value in descriptions.items()
    }


def update_analysis_config():
    """Generate one analysis config file per panel and per gfx architecture.

    Reads utils/unified_config.yaml and, for every panel and every entry of
    GFX_VERSIONS, overwrites
    src/rocprof_compute_soc/analysis_configs/<gfx>/<id>_<title>.yaml.
    Each metric table keeps only the metrics of the architecture being written.
    The digest of every written file is recorded in HASH_FILE_MAP.

    Raises:
        KeyError: if a panel lacks a required key, or a metric table has no
            entry for one of GFX_VERSIONS.
    """
    unified_config = _load_unified_config()

    # Create per panel config file
    for panel_config in unified_config["panels"]:
        file_stem = _panel_file_stem(panel_config["id"], panel_config["title"])
        # Only the plain-text description goes into the analysis config.
        metrics_description = {
            key: value["plain"]
            for key, value in panel_config.get("metrics_description", {}).items()
        }

        for gfx_version in GFX_VERSIONS:
            # Create per gfx architecture folder if it doesn't exist
            gfx_dir = TARGET_DIR.joinpath(gfx_version)
            if not gfx_dir.exists():
                gfx_dir.mkdir(parents=True, exist_ok=True)
                print(f"Created directory: {gfx_dir}")

            # Select metrics from current gfx arch
            data_sources = []
            for data_source_config in panel_config["data source"]:
                # Work on a copy so the unified config stays intact for the
                # next architecture.
                data_source_config = copy.deepcopy(data_source_config)
                if "metric_table" in data_source_config:
                    metric_table = data_source_config["metric_table"]
                    metric_table["metric"] = metric_table["metric"][gfx_version]
                data_sources.append(data_source_config)

            new_panel_config = {
                "Panel Config": {
                    "id": panel_config["id"],
                    "title": panel_config["title"],
                    "metrics_description": metrics_description,
                    "data source": data_sources,
                }
            }

            # Write panel config to file
            filename = gfx_dir.joinpath(f"{file_stem}.yaml")
            _write_autogen_yaml(filename, new_panel_config)
            # Calculate hash of filename
            _record_hash(filename)


def update_documentation():
    """Generate docs/data/metrics_description.yaml from the unified config.

    Every documentation section listed below is mapped to a metric table id;
    the section receives the rst description and unit of each metric described
    by the panel owning that table. Sections whose table id is not present in
    the unified config are omitted. The digest of the written file is recorded
    in HASH_FILE_MAP.
    """
    # Documentation sections
    section_panel_map = {
        "Wavefront launch stats": 701,
        "Wavefront runtime stats": 702,
        "Overall instruction mix": 1001,
        "VALU arithmetic instruction mix": 1002,
        "MFMA instruction mix": 1004,
        "Compute Speed-of-Light": 1101,
        "Pipeline statistics": 1102,
        "Arithmetic operations": 1103,
        "LDS Speed-of-Light": 1201,
        "LDS Statistics": 1202,
        "vL1D Speed-of-Light": 1601,
        "Busy / stall metrics": 1501,
        "Instruction counts": 1502,
        "Spill / stack metrics": 1503,
        "L1 Unified Translation Cache (UTCL1)": 1605,
        "vL1D cache stall metrics": 1602,
        "vL1D cache access metrics": 1603,
        "Vector L1 data-return path or Texture Data (TD)": 1504,
        "L2 Speed-of-Light": 1701,
        "L2 cache accesses": 1703,
        "L2-Fabric interface metrics": 1702,
        "L2 - Fabric interface detailed metrics": 1706,
        "L2 - Fabric Interface stalls": 1705,
        "Scalar L1D Speed-of-Light": 1401,
        "Scalar L1D cache accesses": 1402,
        "Scalar L1D Cache - L2 Interface": 1403,
        "L1I Speed-of-Light": 1301,
        "L1I cache accesses": 1302,
        "L1I <-> L2 interface": 1303,
        "Workgroup manager utilizations": 601,
        "Workgroup Manager - Resource Allocation": 602,
        "Command processor fetcher (CPF)": 501,
        "Command processor packet processor (CPC)": 502,
        "System Speed-of-Light": 201,
    }

    unified_config = _load_unified_config()

    # Metric table id -> metric descriptions of the panel that owns the table
    panel_metric_map = {}
    for panel_config in unified_config["panels"]:
        for data_source in panel_config["data source"]:
            if "metric_table" in data_source:
                table_id = data_source["metric_table"]["id"]
                panel_metric_map[table_id] = _documentation_metrics(panel_config)

    # Merge panel_metric_map with section_panel_map
    section_metric_map = {
        section: panel_metric_map[table_id]
        for section, table_id in section_panel_map.items()
        if table_id in panel_metric_map
    }

    # Write documentation metrics description file
    DOC_TARGET_DIR.mkdir(parents=True, exist_ok=True)
    filename = DOC_TARGET_DIR.joinpath("metrics_description.yaml")
    _write_autogen_yaml(filename, section_metric_map)
    # Calculate hash of filename
    _record_hash(filename)


def update_hash():
    """Write HASH_FILE_MAP to utils/autogen_hash.yaml.

    Call this after update_analysis_config() and update_documentation();
    otherwise the map is still empty and an empty hash file is written.
    """
    _write_autogen_yaml(HASH_FILE, HASH_FILE_MAP)


if __name__ == "__main__":
    update_analysis_config()
    update_documentation()
    update_hash()
+ pop: Pct of Peak + metric: + gfx90a: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) 
/ 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / 
$cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / + $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: 64 + pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) * 1.5625) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / + 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + peak: 100 + pop: 
AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - + TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - + TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + 
L2-Fabric Write Latency: + value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL + gfx941: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + 
SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk *
$cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / + $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / 
SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / + 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + vL1D Cache BW: + value: 
AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum + - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + 
sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL + gfx940: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + 
(64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * 
$cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / + $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) 
+ if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / + 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / 
(End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum + - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: 
AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL + gfx942: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) 
* 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (Int8): + value: 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / + $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront 
Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / + 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + 
TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum + - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - 
Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL + gfx950: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 
1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + MFMA FLOPs (F6F4): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 
512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000)) + MFMA IOPs (Int8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu) * 4))) + VMEM Utilization: + value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + Branch Utilization: + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / + $cu_per_gpu)) + unit: pct + peak: 100 + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / 
SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / + 32) + vL1D Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * 
$cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)))) / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum + - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if 
((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL + gfx908: + VALU FLOPs: + value: None + unit: GFLOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: None + VALU IOPs: + value: None + unit: GIOP/s + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: None + MFMA FLOPs (BF16): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 512) / 1000) + pop: None + MFMA FLOPs (F16): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: None + MFMA FLOPs (F32): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: None + MFMA FLOPs (F64): + value: None + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: None + MFMA IOPs (Int8): + value: None + unit: GIOP/s + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: None + Active CUs: + value: $numActiveCUs + unit: CUs + peak: $cu_per_gpu + pop: ((100 * $numActiveCUs) / $cu_per_gpu) + SALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + peak: 100 + 
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + VALU Utilization: + value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu))) + unit: pct + peak: 100 + pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + MFMA Utilization: + value: None + unit: pct + peak: 100 + pop: None + VMEM Utilization: + value: None + unit: pct + peak: 100 + pop: None + Branch Utilization: + value: None + unit: pct + peak: 100 + pop: None + VALU Active Threads: + value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + peak: $wave_size + pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) + if (SQ_ACTIVE_INST_VALU != 0) else None)) + IPC: + value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + peak: 5 + pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) + Wavefront Occupancy: + value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + peak: ($max_waves_per_cu * $cu_per_gpu) + pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu + * $cu_per_gpu)))) + coll_level: SQ_LEVEL_WAVES + Theoretical LDS Bandwidth: + value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: (($max_sclk * $cu_per_gpu) * 0.128) + pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + LDS Bank Conflicts/Access: + value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/access + peak: 32 + pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / + 32) + vL1D 
Cache Hit Rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + peak: 100 + pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + vL1D Cache BW: + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) + pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + L2 Cache Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + L2 Cache BW: + value: AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) + pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - + TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Write BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + peak: $hbmBandwidth + pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 
64) + ((TCC_EA_WRREQ_sum - + TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) + / $hbmBandwidth) + L2-Fabric Read Latency: + value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + L2-Fabric Write Latency: + value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + peak: None + pop: None + sL1D Cache Hit Rate: + value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + unit: pct + peak: 100 + pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) + if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) + sL1D Cache BW: + value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Hit Rate: + value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + unit: pct + peak: 100 + pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) + L1I BW: + value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) + unit: GB/s + peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) + pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) + * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) + L1I Fetch Latency: + value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + peak: None + pop: None + coll_level: SQ_IFETCH_LEVEL + metrics_description: + VALU FLOPs: + plain: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' 
+ rst: 'The total floating-point operations executed per second on the :ref:`VALU + `. This is also presented as a percent of the peak theoretical + FLOPs achievable on the specific accelerator. Note: this does not include + any floating-point operations from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU IOPs: + plain: 'The total integer operations executed per second on the VALU. This is + also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + rst: 'The total integer operations executed per second on the :ref:`VALU `. + This is also presented as a percent of the peak theoretical IOPs achievable + on the specific accelerator. Note: this does not include any integer operations + from :ref:`MFMA ` instructions.' + unit: GIOPs + MFMA FLOPs (F8): + plain: The total number of 8-bit floating point MFMA operations executed + per second. This does not include any 8-bit floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + F8 MFMA operations achievable on the specific accelerator. It is supported + on AMD Instinct MI300 series and later only. + rst: 'The total number of 8-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 8-bit + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical F8 MFMA operations + achievable on the specific accelerator. It is supported on AMD Instinct MI300 + series and later only.' + unit: GFLOPs + MFMA FLOPs (BF16): + plain: 'The total number of 16-bit brain floating point MFMA operations executed + per second. Note: this does not include any 16-bit brain floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + BF16 MFMA operations achievable on the specific accelerator.'
+ rst: 'The total number of 16-bit brain floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any 16-bit brain + floating point operations from :ref:`VALU ` instructions. This + is also presented as a percent of the peak theoretical BF16 MFMA operations + achievable on the specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F16): + plain: 'The total number of 16-bit floating point MFMA operations executed per + second. Note: this does not include any 16-bit floating point operations from + VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + rst: 'The total number of 16-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F32): + plain: 'The total number of 32-bit floating point MFMA operations executed per + second. Note: this does not include any 32-bit floating point operations from + VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + rst: 'The total number of 32-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 32-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F32 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F64): + plain: 'The total number of 64-bit floating point MFMA operations executed per + second. Note: this does not include any 64-bit floating point operations from + VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' 
+ rst: 'The total number of 64-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 64-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F64 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA IOPs (Int8): + plain: 'The total number of 8-bit integer MFMA operations executed per second. + Note: this does not include any 8-bit integer operations from VALU instructions. + This is also presented as a percent of the peak theoretical INT8 MFMA operations + achievable on the specific accelerator.' + rst: 'The total number of 8-bit integer :ref:`MFMA ` operations executed + per second. Note: this does not include any 8-bit integer operations from + :ref:`VALU ` instructions. This is also presented as a percent + of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.' + unit: GIOPs + Active CUs: + plain: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + unit: Number + rst: Total number of active compute units (CUs) on the accelerator during the + kernel execution. + SALU Utilization: + plain: Indicates what percent of the kernel's duration the SALU was busy executing + instructions. Computed as the ratio of the total number of cycles spent by + the scheduler issuing SALU or SMEM instructions over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`SALU ` + was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing SALU / :ref:`SMEM + ` instructions over the :ref:`total CU cycles `. + unit: Percent + VALU Utilization: + plain: Indicates what percent of the kernel's duration the VALU was busy executing + instructions. Does not include VMEM operations. 
Computed as the ratio of the + total number of cycles spent by the scheduler issuing VALU instructions over + the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`VALU ` + was busy executing instructions. Does not include :ref:`VMEM ` + operations. Computed as the ratio of the total number of cycles spent by the + :ref:`scheduler ` issuing VALU instructions over the :ref:`total + CU cycles `. + unit: Percent + MFMA Utilization: + plain: Indicates what percent of the kernel's duration the MFMA unit was busy + executing instructions. Computed as the ratio of the total number of cycles + the MFMA was busy over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`MFMA ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles the :ref:`MFMA ` was busy over the :ref:`total + CU cycles `. + unit: Percent + VMEM Utilization: + plain: Indicates what percent of the kernel's duration the VMEM unit was busy + executing instructions, including both global/generic and spill/scratch operations + (see the VMEM instruction count metrics for more detail). Does not include + VALU operations. Computed as the ratio of the total number of cycles spent + by the scheduler issuing VMEM instructions over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`VMEM ` + unit was busy executing instructions, including both global/generic and spill/scratch + operations (see the :ref:`VMEM instruction count metrics ` + for more detail). Does not include :ref:`VALU ` operations. Computed + as the ratio of the total number of cycles spent by the :ref:`scheduler ` + issuing VMEM instructions over the :ref:`total CU cycles `. + unit: Percent + Branch Utilization: + plain: Indicates what percent of the kernel's duration the branch unit was busy + executing instructions.
Computed as the ratio of the total number of cycles + spent by the scheduler issuing branch instructions over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`branch ` + unit was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing branch instructions + over the :ref:`total CU cycles `. + unit: Percent + VALU Active Threads: + plain: Indicates the average level of divergence within a wavefront over the + lifetime of the kernel. The number of work-items that were active in a wavefront + during execution of each VALU instruction, time-averaged over all VALU instructions + run on all wavefronts in the kernel. + rst: Indicates the average level of :ref:`divergence ` within + a wavefront over the lifetime of the kernel. The number of work-items that + were active in a wavefront during execution of each :ref:`VALU ` + instruction, time-averaged over all VALU instructions run on all wavefronts + in the kernel. + unit: Work-items + IPC: + plain: The ratio of the total number of instructions executed on the CU over + the total active CU cycles. This is also presented as a percent of the peak + theoretical IPC achievable on the specific accelerator. + rst: The ratio of the total number of instructions executed on the :doc:`CU + ` over the :ref:`total active CU cycles `. + unit: Instructions per-cycle + Wavefront Occupancy: + plain: 'The time-averaged number of wavefronts resident on the accelerator over + the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms). This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.' + rst: 'The time-averaged number of wavefronts resident on the accelerator over + the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).
This is also presented as a percent of the peak theoretical + occupancy achievable on the specific accelerator.' + unit: Wavefronts + Theoretical LDS Bandwidth: + plain: Indicates the maximum amount of bytes that could have been loaded from, + stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth + example for more detail). This is also presented as a percent of the peak + theoretical LDS bandwidth achievable on the specific accelerator. + rst: Indicates the maximum amount of bytes that could have been loaded from, + stored to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth + ` example for more detail). This is also presented as a percent + of the peak theoretical LDS bandwidth achievable on the specific accelerator. + unit: GB/s + LDS Bank Conflicts/Access: + plain: The ratio of the number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) to the base + number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is also presented in normalized form (i.e., the Bank + Conflict Rate). + rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler ` + due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in + a completely uncontended case. This is also presented in normalized form + (i.e., the Bank Conflict Rate). + unit: Conflicts/Access + vL1D Cache Hit Rate: + plain: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D cache RAM. + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D cache RAM + `. + unit: Percent + vL1D Cache BW: + plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions + per unit time.
The number of bytes is calculated as the number of cache lines + requested multiplied by the cache line size. This value does not consider + partial requests, so e.g., if only a single value is requested in a cache + line, the data movement will still be counted as a full cache line. This is + also presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions per unit time. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. + This value does not consider partial requests, so e.g., if only a single + value is requested in a cache line, the data movement will still be counted + as a full cache line. This is also presented as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. + unit: GB/s + L2 Cache Hit Rate: + plain: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2 Cache BW: + plain: The number of bytes looked up in the L2 cache per unit time. The number + of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + rst: The number of bytes looked up in the L2 cache per unit time. The number of + bytes is calculated as the number of cache lines requested multiplied by + the cache line size. 
This value does not consider partial requests, so e.g., + if only a single value is requested in a cache line, the data movement will + still be counted as a full cache line. This is also presented as a percent + of the peak theoretical bandwidth achievable on the specific accelerator. + unit: GB/s + L2-Fabric Read BW: + plain: "The number of bytes read by the L2 over the Infinity Fabric\u2122 interface\ + \ per unit time. This is also presented as a percent of the peak theoretical\ + \ bandwidth achievable on the specific accelerator." + rst: "The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122\ + \ interface ` per unit time. This is also presented as a percent\ + \ of the peak theoretical bandwidth achievable on the specific accelerator." + unit: GB/s + L2-Fabric Write BW: + plain: The number of bytes sent by the L2 over the Infinity Fabric interface + by write and atomic operations per unit time. This is also presented as a + percent of the peak theoretical bandwidth achievable on the specific accelerator. + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. This is also presented + as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + unit: GB/s + L2-Fabric Read Latency: + plain: The time-averaged number of cycles read requests spent in Infinity Fabric + before data was returned to the L2. + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + L2-Fabric Write Latency: + plain: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. 
+ unit: Cycles + sL1D Cache Hit Rate: + plain: The percent of sL1D requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of sL1D requests that hit over + the number of all sL1D requests. + rst: The percent of sL1D requests that hit on a previously loaded line in the cache. + Calculated as the ratio of the number of sL1D requests that hit over the + number of all sL1D requests. + unit: Percent + sL1D Cache BW: + plain: The number of bytes looked up in the sL1D cache per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + rst: The number of bytes looked up in the sL1D cache per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. + unit: GB/s + L1I Hit Rate: + plain: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over + the number of all L1I requests. + rst: The percent of L1I requests that hit on a previously loaded line in the cache. + Calculated as the ratio of the number of L1I requests that hit over the number + of all L1I requests. + unit: Percent + L1I BW: + plain: The number of bytes looked up in the L1I cache per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + rst: The number of bytes looked up in the L1I cache per unit time. This is also + presented as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. + unit: GB/s + L1I Fetch Latency: + plain: The average number of cycles spent to fetch instructions to a CU. + rst: The average number of cycles spent to fetch instructions to a :doc:`CU + `.
+ unit: Cycles +- id: 300 + title: Memory Chart + data source: + - metric_table: + id: 301 + title: Memory Chart + header: + metric: Metric + value: Value + metric: + gfx90a: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) + else 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / 
TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if + (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) + if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + 
value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + != 0) else None)), 0) + L2 Wr Lat: + value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) + gfx941: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) + else 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS 
Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if + (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / 
SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) + if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: null + L2 Wr Lat: + value: null + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + gfx940: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) + else 
0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if + (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: 
ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) + if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: null + L2 Wr Lat: + value: null + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: 
ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + gfx942: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) + else 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + 
TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if + (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / 
$denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) + if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: null + L2 Wr Lat: + value: null + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + gfx950: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) + else 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + 
value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), + 0) + Workgroups: + value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None )), 0) + VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if + (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if 
(SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) + if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + != 0) else None)), 0) + L2 Wr Lat: + value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: 
ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + gfx908: + Wavefront Occupancy: + value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), + 0) + coll_level: SQ_LEVEL_WAVES + Wave Life: + value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) + else 0)), 0) + SALU: + value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) + SMEM: + value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) + VALU: + value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) + MFMA: + value: None + VMEM: + value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) + LDS: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + GWS: + value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) + BR: + value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) + Active CUs: + value: $numActiveCUs + Num CUs: + value: $cu_per_gpu + VGPR: + value: ROUND(AVG(Arch_VGPR), 0) + SGPR: + value: ROUND(AVG(SGPR), 0) + LDS Allocation: + value: ROUND(AVG(LDS_Per_Workgroup), 0) + Scratch Allocation: + value: ROUND(AVG(Scratch_Per_Workitem), 0) + Wavefronts: + value: ROUND(AVG(SPI_CSN_WAVE), 0) + Workgroups: + value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) + LDS Req: + value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) + LDS Util: + value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))), 0) + LDS Latency: + value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS + != 0) else None)),0) + coll_level: SQ_INST_LEVEL_LDS + VL1 Rd: + value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) + VL1 Wr: + value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) + VL1 Atomic: + value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)), 0) + VL1 Hit: + value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None )), 0) 
+ VL1 Lat: + value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if + (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) + VL1 Coalesce: + value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) + VL1 Stall: + value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None)), 0) + VL1_L2 Rd: + value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) + VL1_L2 Wr: + value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) + VL1_L2 Atomic: + value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)), 0) + sL1D Rd: + value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) + sL1D Hit: + value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + sL1D Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ + != 0) else None)) * 100), 0) + coll_level: SQC_DCACHE_INFLIGHT_LEVEL + sL1D_L2 Rd: + value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) + sL1D_L2 Wr: + value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) + sL1D_L2 Atomic: + value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) + IL1 Fetch: + value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) + IL1 Hit: + value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) + IL1 Lat: + value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ + != 0) else None)) * 100), 0) + IL1_L2 Rd: + value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) + L2 Rd: + value: ROUND(AVG((TCC_READ_sum / $denom)), 0) + L2 Wr: + value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) + L2 Atomic: + value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) + L2 Hit: + value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) + if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) + L2 Rd Lat: + value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + + 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + != 0) else None)), 0) + L2 Wr Lat: + value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) + Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) + Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) + Fabric Rd Lat: + value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else 0)), 0) + Fabric Wr Lat: + value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else 0)), 0) + Fabric Atomic Lat: + value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else 0)), 0) + HBM Rd: + value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) + HBM Wr: + value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) + comparable: false + cli_style: mem_chart + tui_style: mem_chart + metrics_description: + Wavefront Occupancy: + plain: Wavefronts per active CU. + rst: Wavefronts per active CU. + unit: Wavefronts + Wave Life: + plain: Average number of cycles executing a wave. + rst: Average number of cycles executing a wave. + unit: Cycles per wave + SALU: + plain: Total Number of SALU (Scalar ALU) instructions issued per normalization + unit. + rst: Total Number of SALU (Scalar ALU) instructions issued per normalization + unit. + unit: Instructions per normalization unit + SMEM: + plain: Total number of SMEM (Scalar Memory Read) instructions issued per normalization + unit. + rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization + unit. + unit: Instructions per normalization unit + VALU: + plain: The number of VALU (Vector ALU) instructions issued per normalization + unit.
+ rst: The number of VALU (Vector ALU) instructions issued per normalization unit. + unit: Instructions per normalization unit + MFMA: + plain: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued + per normalization unit. + rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per + normalization unit. + unit: Instructions per normalization unit + VMEM: + plain: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit. + rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch + memory) per normalization unit. + unit: Instructions per normalization unit + LDS: + plain: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + rst: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's __shfl instructions) executed per normalization unit. + unit: Instructions per normalization unit + GWS: + plain: Total number of GDS (global data sync) instructions issued per normalization + unit. + rst: Total number of GDS (global data sync) instructions issued per normalization + unit. + unit: Instructions per normalization unit + BR: + plain: Total number of BRANCH instructions issued per normalization unit. + rst: Total number of BRANCH instructions issued per normalization unit. + unit: Instructions per normalization unit + Active CUs: + plain: Total number of active compute units (CUs) on the accelerator during + the kernel execution. + rst: Total number of active compute units (CUs) on the accelerator during the + kernel execution. + unit: CUs + Num CUs: + plain: Total number of compute units (CUs) on the accelerator. + rst: Total number of compute units (CUs) on the accelerator. + unit: CUs + VGPR: + plain: 'The number of architected vector general-purpose registers allocated + for the kernel, see VALU. 
Note: this may not exactly match the number of VGPRs + requested by the compiler due to allocation granularity.' + rst: 'The number of architected vector general-purpose registers allocated for the + kernel, see :ref:`VALU `. Note: this may not exactly match the + number of VGPRs requested by the compiler due to allocation granularity.' + unit: VGPRs + SGPR: + plain: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + rst: 'The number of scalar general-purpose registers allocated for the kernel, see + :ref:`SALU `. Note: this may not exactly match the number of + SGPRs requested by the compiler due to allocation granularity.' + unit: SGPRs + LDS Allocation: + plain: 'The number of bytes of LDS memory (or, shared memory) allocated for + this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + rst: 'The number of bytes of :doc:`LDS ` memory (or, shared memory) + allocated for this kernel. Note: This may also be larger than what was requested + at compile time due to both allocation granularity and dynamic per-dispatch + LDS allocations.' + unit: Bytes per workgroup + Scratch Allocation: + plain: The number of bytes of scratch memory requested per work-item for this + kernel. Scratch memory is used for stack memory on the accelerator, as well + as for register spills and restores. + rst: The number of bytes of :ref:`scratch memory ` requested per + work-item for this kernel. Scratch memory is used for stack memory on the + accelerator, as well as for register spills and restores. + unit: Bytes per work-item + Wavefronts: + plain: The total number of wavefronts, summed over all workgroups, forming this + kernel launch. + rst: The total number of wavefronts, summed over all workgroups, forming this + kernel launch.
+ unit: Wavefronts + Workgroups: + plain: The total number of workgroups forming this kernel launch. + rst: The total number of workgroups forming this kernel launch. + unit: Workgroups + LDS Req: + plain: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + rst: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit + `. + unit: Instructions per normalization unit + LDS Util: + plain: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic + and HIP's __shfl operations). Calculated as the ratio of the total number + of cycles LDS was active over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`LDS ` was + actively executing instructions (including, but not limited to, load, store, + atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the + total number of cycles LDS was active over the :ref:`total CU cycles `. + unit: Percent + LDS Latency: + plain: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + rst: The average number of round-trip cycles (i.e., from issue to data-return / + acknowledgment) required for an LDS instruction to complete. 
+ unit: Cycles + VL1 Rd: + plain: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit + rst: The total number of incoming read requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + VL1 Wr: + plain: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit + rst: The total number of incoming write requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + VL1 Atomic: + plain: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit + rst: The total number of incoming atomic requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + VL1 Hit: + plain: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D Cache RAM + `. + unit: Percent + VL1 Lat: + plain: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + rst: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + unit: Cycles + VL1 Coalesce: + plain: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). + Calculated as the average number of thread-requests generated per instruction + divided by the ideal number of thread-requests per instruction. 
+ rst: Indicates how well memory instructions were coalesced by the :ref:`address + processing unit `, ranging from uncoalesced (25%) to fully coalesced + (100%). Calculated as the average number of :ref:`thread-requests ` + generated per instruction divided by the ideal number of thread-requests per + instruction. + unit: Percent + VL1 Stall: + plain: The ratio of the number of cycles where the vL1D is stalled waiting to + issue a request for data to the L2 cache divided by the number of cycles where + the vL1D is active. + rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue + a request for data to the :doc:`L2 cache ` divided by the number + of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + VL1_L2 Rd: + plain: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit. + rst: The number of read requests for a vL1D cache line that were not satisfied by + the vL1D and must be retrieved from the :doc:`L2 Cache ` + per :ref:`normalization unit `. + unit: Requests per normalization unit + VL1_L2 Wr: + plain: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + rst: The number of write requests to a vL1D cache line that were sent through the + vL1D to the :doc:`L2 cache `, per :ref:`normalization unit `. + unit: Requests per normalization unit + VL1_L2 Atomic: + plain: The number of atomic requests that are sent through the vL1D to the L2 + cache, per normalization unit. This includes requests for atomics with, and + without return. + rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 + cache `, per :ref:`normalization unit `. This + includes requests for atomics with, and without return.
+ unit: Requests per normalization unit + sL1D Rd: + plain: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization + unit `. + unit: Requests per normalization unit + sL1D Hit: + plain: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit. + rst: The total number of sL1D requests that hit on a previously loaded cache line, + per :ref:`normalization unit `. + unit: Requests per normalization unit + sL1D_L2 Rd: + plain: The total number of read requests from sL1D to the L2, per normalization + unit. + rst: The total number of read requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. + unit: Requests per normalization unit + sL1D_L2 Wr: + plain: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + rst: The total number of write requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + sL1D_L2 Atomic: + plain: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + rst: The total number of atomic requests from sL1D to the :doc:`L2 `, + per :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + IL1 Fetch: + plain: The total number of requests made to the L1I per normalization-unit. + rst: The total number of requests made to the L1I per :ref:`normalization-unit + `. + unit: Requests per normalization unit + IL1 Hit: + plain: The percent of L1I requests that hit on a previously loaded line in the + cache. Calculated as the ratio of the number of L1I requests that hit over + the number of all L1I requests.
+ rst: The total number of L1I requests that hit on a previously loaded cache line, + per :ref:`normalization-unit `. + unit: Percent + IL1 Lat: + plain: The average number of cycles spent to fetch instructions to a CU. + rst: The average number of cycles spent to fetch instructions to a :doc:`CU + `. + unit: Cycles + IL1_L2 Rd: + plain: The total number of requests across the L1I - L2 interface per normalization-unit. + rst: The total number of requests across the L1I - L2 interface per normalization-unit. + unit: Requests per normalization unit + L2 Rd: + plain: The total number of read requests to the L2 from all clients. + rst: The total number of read requests to the L2 from all clients. + unit: Requests per normalization unit + L2 Wr: + plain: The total number of write requests to the L2 from all clients. + rst: The total number of write requests to the L2 from all clients. + unit: Requests per normalization unit + L2 Atomic: + plain: The total number of atomic requests (with and without return) to the + L2 from all clients. + rst: The total number of atomic requests (with and without return) to the L2 from + all clients. + unit: Requests per normalization unit + L2 Hit: + plain: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2 Rd Lat: + plain: Calculated as the average number of cycles that the vL1D cache took to + issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive read requests from the :doc:`L2 Cache `. This number + also includes requests for atomics with return values. 
+ unit: Cycles + L2 Wr Lat: + plain: Calculated as the average number of cycles that the vL1D cache took to + issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values. + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive acknowledgement of a write request to the :doc:`L2 Cache `. + This number also includes requests for atomics without return values. + unit: Cycles + Fabric_L2 Rd: + plain: Number of L2 cache - Infinity Fabric read requests (either 32-byte or + 64-byte) summed over TCC instances per normalization unit. + rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte) + summed over TCC instances per normalization unit. + unit: Requests per normalization unit + Fabric_L2 Wr: + plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or + 64-byte) summed over TCC instances per normalization unit. + rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or + 64-byte) summed over TCC instances per normalization unit. + unit: Requests per normalization unit + Fabric_L2 Atomic: + plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or + 64-byte) that are actually atomic requests summed over TCC instances per normalization + unit. + rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or + 64-byte) that are actually atomic requests summed over TCC instances per normalization + unit. + unit: Requests per normalization unit + Fabric Rd Lat: + plain: The time-averaged number of cycles read requests spent in Infinity Fabric + before data was returned to the L2. + rst: The time-averaged number of cycles read requests spent in Infinity Fabric + before data was returned to the L2. 
+ unit: Cycles + Fabric Wr Lat: + plain: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + Fabric Atomic Lat: + plain: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2. + rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric + before a completion acknowledgement (atomic without return value) or data + (atomic with return value) was returned to the L2. + unit: Cycles + HBM Rd: + plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Wr: + plain: 'The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator''s local HBM, per normalization + unit. ' + rst: The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B + of data in the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail.
+ unit: Requests per normalization unit +- id: 400 + title: Roofline + data source: + - None: + id: 401 + title: Roofline +- id: 500 + title: Command Processor (CPC/CPF) + data source: + - metric_table: + id: 501 + title: Command processor fetcher (CPF) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: AVG(((100 * 
CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + gfx941: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + avg: 
AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + gfx940: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 Stall: + 
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + gfx942: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + CPF-UTCL1 
Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + gfx950: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: pct + 
CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + gfx908: + CPF Utilization: + avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) + if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) + unit: pct + CPF Stall: + avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY + != 0) else None)) + unit: pct + CPF-L2 Utilization: + avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) + if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) + unit: pct + CPF-L2 Stall: + avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY + != 0) else None)) + unit: 
pct + CPF-UTCL1 Stall: + avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) + if (CPF_CPF_STAT_BUSY != 0) else None) + unit: pct + - metric_table: + id: 502 + title: Command processor packet processor (CPC) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 
* CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct + gfx941: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / 
CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + 
CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct + gfx940: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / 
(CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct + gfx942: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else 
None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if 
((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct + gfx950: + CPC SYNC FIFO Full Rate: + avg: AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY + != 0) else None) + min: MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY + != 0) else None) + max: MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY + != 0) else None) + unit: pct + CPC CANE Stall Rate: + avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) + else None) + min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) + else None) + max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) + else None) + unit: pct + CPC ADC Utilization: + avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else + None) + min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else + None) + max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else + None) + unit: pct + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + 
unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + 
CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct + gfx908: + CPC Utilization: + avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) + if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) + unit: pct + CPC Stall Rate: + avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY + != 0) else None)) + unit: pct + CPC Packet Decoding Utilization: + avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if + (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-Workgroup Manager Utilization: + avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY + != 0) else None) + unit: Pct + CPC-L2 Utilization: + avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + max: MAX((((100 * 
CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) + if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) + unit: pct + CPC-UTCL1 Stall: + avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) + if (CPC_CPC_STAT_BUSY != 0) else None) + unit: pct + CPC-UTCL2 Utilization: + avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) + if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) + unit: pct + metrics_description: + CPF Utilization: + plain: Percent of total cycles where the CPF was busy actively doing any work. + The ratio of CPF busy cycles over total cycles counted by the CPF. + rst: Percent of total cycles where the CPF was busy actively doing any work. + The ratio of CPF busy cycles over total cycles counted by the CPF. + unit: Percent + CPF Stall: + plain: Percent of CPF busy cycles where the CPF was stalled for any reason. + rst: Percent of CPF busy cycles where the CPF was stalled for any reason. + unit: Percent + CPF-L2 Utilization: + plain: Percent of total cycles counted by the CPF-L2 interface where the CPF-L2 + interface was active doing any work. The ratio of CPF-L2 busy cycles over + total cycles counted by the CPF-L2. + rst: Percent of total cycles counted by the CPF-:doc:`L2 ` interface where + the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy + cycles over total cycles counted by the CPF-L2. 
+ unit: Percent + CPF-L2 Stall: + plain: Percent of CPF-L2 busy cycles where the CPF-L2 interface was stalled + for any reason. + rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface + was stalled for any reason. + unit: Percent + CPF-UTCL1 Stall: + plain: Percent of CPF busy cycles where the CPF was stalled by address translation. + rst: Percent of CPF busy cycles where the CPF was stalled by address translation. + unit: Percent + CPC Utilization: + plain: Percent of total cycles where the CPC was busy actively doing any work. + The ratio of CPC busy cycles over total cycles counted by the CPC. + rst: Percent of total cycles where the CPC was busy actively doing any work. + The ratio of CPC busy cycles over total cycles counted by the CPC. + unit: Percent + CPC Stall Rate: + plain: Percent of CPC busy cycles where the CPC was stalled for any reason. + rst: Percent of CPC busy cycles where the CPC was stalled for any reason. + unit: Percent + CPC Packet Decoding Utilization: + plain: Percent of CPC busy cycles spent decoding commands for processing. + rst: Percent of CPC busy cycles spent decoding commands for processing. + unit: Percent + CPC-Workgroup Manager Utilization: + plain: Percent of CPC busy cycles spent dispatching workgroups to the workgroup + manager. + rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup + manager `. + unit: Percent + CPC-L2 Utilization: + plain: Percent of total cycles counted by the CPC-L2 interface where the CPC-L2 + interface was active doing any work. + rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface where + the CPC-L2 interface was active doing any work. 
+ unit: Percent + CPC-UTCL1 Stall: + plain: Percent of CPC busy cycles where the CPC was stalled by address translation. + rst: Percent of CPC busy cycles where the CPC was stalled by address translation. + unit: Percent + CPC-UTCL2 Utilization: + plain: 'Percent of total cycles counted by the CPC''s L2 address translation + interface where the CPC was busy doing address translation work. ' + rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address translation + interface where the CPC was busy doing address translation work. + unit: Percent +- id: 600 + title: Workgroup Manager (SPI) + data source: + - metric_table: + id: 601 + title: Workgroup manager utilizations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / 
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + gfx941: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + 
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + gfx940: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / 
$GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + gfx942: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + 
unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + gfx950: + Schedule-Pipe Wave Occupancy: + avg: AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + + SPI_CSQ_P3_OCCUPANCY) + min: MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + + SPI_CSQ_P3_OCCUPANCY) + max: MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + + 
SPI_CSQ_P3_OCCUPANCY) + unit: Wave + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) + / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) + / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) + / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + unit: Pct + Scheduler-Pipe Wave Utilization: + avg: AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + + 
SPI_CS3_NUM_THREADGROUPS) + min: MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + + SPI_CS3_NUM_THREADGROUPS) + max: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + + SPI_CS3_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + != 0) else None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + != 0) else None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + != 0) else None)) + unit: Cycles/wave + gfx908: + Accelerator Utilization: + avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + max: MAX(100 * 
$GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) + unit: Pct + Scheduler-Pipe Utilization: + avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu + * $se_per_gpu)) + unit: Pct + Workgroup Manager Utilization: + avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) + unit: Pct + Shader Engine Utilization: + avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) + unit: Pct + SIMD Utilization: + avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Dispatched Workgroups: + avg: AVG(SPI_CSN_NUM_THREADGROUPS) + min: MIN(SPI_CSN_NUM_THREADGROUPS) + max: MAX(SPI_CSN_NUM_THREADGROUPS) + unit: Workgroups + Dispatched Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + VGPR Writes: + avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + unit: Cycles/wave + SGPR Writes: + avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) + else None)) + max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) 
+ else None)) + unit: Cycles/wave + - metric_table: + id: 602 + title: Workgroup Manager - Resource Allocation + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN 
/ ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD 
* $cu_per_gpu)) + unit: Pct + gfx941: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * 
SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + gfx940: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * 
$se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / 
($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + gfx942: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if 
($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * 
SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + gfx950: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) 
else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe FIFO Full Rate: + avg: AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if + ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if + ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if + ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: 
AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * 
SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + gfx908: + Not-scheduled Rate (Workgroup Manager): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Not-scheduled Rate (Scheduler-Pipe): + avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Scheduler-Pipe Stall Rate: + avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) + unit: Pct + Scratch Stall Rate: + avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) + if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) + unit: Pct + Insufficient SIMD Waveslots: + avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / 
($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD VGPRs: + avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient SIMD SGPRs: + avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Insufficient CU LDS: + avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Insufficient CU Barriers: + avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + Reached CU Workgroup Limit: + avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * + $cu_per_gpu)) + unit: Pct + Reached CU Wavefront Limit: + avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + metrics_description: + Accelerator Utilization: + plain: The 
percent of cycles in the kernel where the accelerator was actively + doing any work. + rst: The percent of cycles in the kernel where the accelerator was actively + doing any work. + unit: Percent + Scheduler-Pipe Utilization: + plain: The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes + were actively doing any work. + rst: 'The percent of :ref:`total scheduler-pipe cycles ` + in the kernel where the scheduler-pipes were actively doing any work. Note: this + value is expected to range between 0% and 25%. See :ref:`desc-spi`.' + unit: Percent + Workgroup Manager Utilization: + plain: The percent of cycles in the kernel where the workgroup manager was actively + doing any work. + rst: The percent of cycles in the kernel where the workgroup manager was actively + doing any work. + unit: Percent + Shader Engine Utilization: + plain: The percent of total shader engine cycles in the kernel where any CU + in a shader-engine was actively doing any work, normalized over all shader-engines. + Low values (e.g., << 100%) indicate that the accelerator was not fully saturated + by the kernel, or a potential load-imbalance issue. + rst: The percent of :ref:`total shader engine cycles ` in the kernel + where any CU in a shader-engine was actively doing any work, normalized over + all shader-engines. Low values (e.g., << 100%) indicate that the accelerator + was not fully saturated by the kernel, or a potential load-imbalance issue. + unit: Percent + SIMD Utilization: + plain: The percent of total SIMD cycles in the kernel where any SIMD on a CU + was actively doing any work, summed over all CUs. Low values (less than 100%) + indicate that the accelerator was not fully saturated by the kernel, or a + potential load-imbalance issue. + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + any :ref:`SIMD ` on a CU was actively doing any work, summed over + all CUs. 
Low values (less than 100%) indicate that the accelerator was not + fully saturated by the kernel, or a potential load-imbalance issue. + unit: Percent + Dispatched Workgroups: + plain: The total number of workgroups forming this kernel launch. + rst: The total number of workgroups forming this kernel launch. + unit: Workgroups + Dispatched Wavefronts: + plain: The total number of wavefronts, summed over all workgroups, forming this + kernel launch. + rst: The total number of wavefronts, summed over all workgroups, forming this + kernel launch. + unit: Wavefronts + VGPR Writes: + plain: The average number of cycles spent initializing VGPRs at wave creation. + rst: The average number of cycles spent initializing :ref:`VGPRs ` at + wave creation. + unit: Cycles/wave + SGPR Writes: + plain: The average number of cycles spent initializing SGPRs at wave creation. + rst: The average number of cycles spent initializing :ref:`SGPRs ` at + wave creation. + unit: Cycles/wave + Not-scheduled Rate (Workgroup Manager): + plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup + could not be scheduled to a CU due to a bottleneck within the workgroup manager + rather than a lack of a CU or SIMD with sufficient resources. + rst: 'The percent of :ref:`total scheduler-pipe cycles ` + in the kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to a bottleneck within the workgroup manager rather than a lack of a + CU or :ref:`SIMD ` with sufficient resources. Note: this value + is expected to range between 0-25%. See note in :ref:`workgroup manager ` + description.' + unit: Percent + Not-scheduled Rate (Scheduler-Pipe): + plain: 'The percent of total scheduler-pipe cycles in the kernel where a workgroup + could not be scheduled to a CU due to a bottleneck within the scheduler-pipes + rather than a lack of a CU or SIMD with sufficient resources. 
' + rst: 'The percent of :ref:`total scheduler-pipe cycles ` + in the kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to a bottleneck within the scheduler-pipes rather than a lack of a CU + or :ref:`SIMD ` with sufficient resources. Note: this value is + expected to range between 0-25%, see note in :ref:`workgroup manager ` + description.' + unit: Percent + Scheduler-Pipe Stall Rate: + plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup + could not be scheduled to a CU due to occupancy limitations (like a lack of + a CU or SIMD with sufficient resources). + rst: 'The percent of :ref:`total scheduler-pipe cycles ` + in the kernel where a workgroup could not be scheduled to a :doc:`CU ` + due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` + with sufficient resources). Note: this value is expected to range between + 0-25%, see note in :ref:`workgroup manager ` description.' + unit: Percent + Scratch Stall Rate: + plain: The percent of total shader-engine cycles in the kernel where a workgroup + could not be scheduled to a CU due to lack of private (a.k.a., scratch) memory + slots. While this can reach up to 100%, note that the actual occupancy limitations + on a kernel using private memory are typically quite small (for example, less + than 1% of the total number of waves that can be scheduled to an accelerator). + rst: The percent of :ref:`total shader-engine cycles ` in the kernel + where a workgroup could not be scheduled to a :doc:`CU ` due + to lack of :ref:`private (a.k.a., scratch) memory ` slots. While + this can reach up to 100%, note that the actual occupancy limitations on + a kernel using private memory are typically quite small (for example, less than + 1% of the total number of waves that can be scheduled to an accelerator). 
+ unit: Percent + Insufficient SIMD Waveslots: + plain: The percent of total SIMD cycles in the kernel where a workgroup could + not be scheduled to a SIMD due to lack of available waveslots. + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`waveslots `. + unit: Percent + Insufficient SIMD VGPRs: + plain: The percent of total SIMD cycles in the kernel where a workgroup could + not be scheduled to a SIMD due to lack of available VGPRs. + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`VGPRs `. + unit: Percent + Insufficient SIMD SGPRs: + plain: The percent of total SIMD cycles in the kernel where a workgroup could + not be scheduled to a SIMD due to lack of available SGPRs. + rst: The percent of :ref:`total SIMD cycles ` in the kernel where + a workgroup could not be scheduled to a :ref:`SIMD ` due to lack + of available :ref:`SGPRs `. + unit: Percent + Insufficient CU LDS: + plain: The percent of total CU cycles in the kernel where a workgroup could + not be scheduled to a CU due to lack of available LDS. + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to lack + of available :doc:`LDS `. + unit: Percent + Insufficient CU Barriers: + plain: The percent of total CU cycles in the kernel where a workgroup could + not be scheduled to a CU due to lack of available barriers. + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to lack + of available :ref:`barriers `. + unit: Percent + Reached CU Workgroup Limit: + plain: The percent of total CU cycles in the kernel where a workgroup could + not be scheduled to a CU due to limits within the workgroup manager. 
This + is expected to always be zero on CDNA2 or newer accelerators (and small + for previous accelerators). + rst: The percent of :ref:`total CU cycles ` in the kernel where + a workgroup could not be scheduled to a :doc:`CU ` due to limits + within the workgroup manager. This is expected to always be zero on CDNA2 + or newer accelerators (and small for previous accelerators). + unit: Percent + Reached CU Wavefront Limit: + plain: The percent of total CU cycles in the kernel where a wavefront could + not be scheduled to a CU due to limits within the workgroup manager. This + is expected to always be zero on CDNA2 or newer accelerators (and small + for previous accelerators). + rst: The percent of :ref:`total CU cycles ` in the kernel where + a wavefront could not be scheduled to a :doc:`CU ` due to limits + within the workgroup manager. This is expected to always be zero on CDNA2 + or newer accelerators (and small for previous accelerators). + unit: Percent +- id: 700 + title: Wavefront + data source: + - metric_table: + id: 701 + title: Wavefront Launch Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) +
max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + gfx941: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + gfx940: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + 
unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + gfx942: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + gfx950: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: 
MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + gfx908: + Grid Size: + avg: AVG(Grid_Size) + min: MIN(Grid_Size) + max: MAX(Grid_Size) + unit: Work Items + Workgroup Size: + avg: AVG(Workgroup_Size) + min: MIN(Workgroup_Size) + max: MAX(Workgroup_Size) + unit: Work Items + Total Wavefronts: + avg: AVG(SPI_CSN_WAVE) + min: MIN(SPI_CSN_WAVE) + max: MAX(SPI_CSN_WAVE) + unit: Wavefronts + Saved Wavefronts: + avg: AVG(SQ_WAVES_SAVED) + min: MIN(SQ_WAVES_SAVED) + max: MAX(SQ_WAVES_SAVED) + unit: Wavefronts + Restored Wavefronts: + avg: AVG(SQ_WAVES_RESTORED) + min: MIN(SQ_WAVES_RESTORED) + max: MAX(SQ_WAVES_RESTORED) + unit: Wavefronts + VGPRs: + avg: AVG(Arch_VGPR) + min: MIN(Arch_VGPR) + max: MAX(Arch_VGPR) + unit: Registers + AGPRs: + avg: AVG(Accum_VGPR) + min: MIN(Accum_VGPR) + max: MAX(Accum_VGPR) + unit: Registers + SGPRs: + avg: AVG(SGPR) + min: MIN(SGPR) + max: MAX(SGPR) + unit: Registers + LDS Allocation: + avg: AVG(LDS_Per_Workgroup) + min: MIN(LDS_Per_Workgroup) + max: MAX(LDS_Per_Workgroup) + unit: Bytes + Scratch Allocation: + avg: AVG(Scratch_Per_Workitem) + min: 
MIN(Scratch_Per_Workitem) + max: MAX(Scratch_Per_Workitem) + unit: Bytes/Workitem + - metric_table: + id: 702 + title: Wavefront Runtime Stats + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES + gfx941: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) 
+ unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES + gfx940: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue 
Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES + gfx942: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: 
MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES + gfx950: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES + gfx908: + Kernel Time: + avg: AVG((End_Timestamp - Start_Timestamp)) + min: MIN((End_Timestamp - Start_Timestamp)) + max: MAX((End_Timestamp - Start_Timestamp)) + unit: ns + Kernel Time (Cycles): + avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) + min: MIN($GRBM_GUI_ACTIVE_PER_XCD) + max: MAX($GRBM_GUI_ACTIVE_PER_XCD) + unit: Cycle + Instructions per wavefront: + avg: AVG((SQ_INSTS / SQ_WAVES)) + min: MIN((SQ_INSTS / SQ_WAVES)) + max: 
MAX((SQ_INSTS / SQ_WAVES)) + unit: Instr/wavefront + Wave Cycles: + avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) + min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) + max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) + unit: (Cycles + $normUnit) + Dependency Wait Cycles: + avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_ANY) / $denom)) + unit: (Cycles + $normUnit) + Issue Wait Cycles: + avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) + min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) + max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Active Cycles: + avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) + unit: (Cycles + $normUnit) + Wavefront Occupancy: + avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) + unit: Wavefronts + coll_level: SQ_LEVEL_WAVES + metrics_description: + Grid Size: + plain: The total number of work-items (or, threads) launched as a part of the + kernel dispatch. In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + rst: The total number of work-items (or, threads) launched as a part of the + kernel dispatch. In HIP, this is equivalent to the total grid size multiplied + by the total workgroup (or, block) size. + unit: Work-Items + Workgroup Size: + plain: The total number of work-items (or, threads) in each workgroup (or, block) + launched as part of the kernel dispatch. In HIP, this is equivalent to the + total block size. + rst: The total number of work-items (or, threads) in each workgroup (or, block) + launched as part of the kernel dispatch. In HIP, this is equivalent to the + total block size. 
+ unit: Work-Items + Total Wavefronts: + plain: "The total number of wavefronts launched as part of the kernel dispatch.\ + \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\ + \ size is always 64 work-items. Thus, the total number of wavefronts should\ + \ be equivalent to the ceiling of grid size divided by 64." + rst: "The total number of wavefronts launched as part of the kernel dispatch.\ + \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\ + \ size is always 64 work-items. Thus, the total number of wavefronts should\ + \ be equivalent to the ceiling of grid size divided by 64." + unit: Wavefronts + Saved Wavefronts: + plain: The total number of wavefronts saved at a context-save. + rst: The total number of wavefronts saved at a context-save. See `cwsr_enable + `_. + unit: Wavefronts + Restored Wavefronts: + plain: The total number of wavefronts restored from a context-save. + rst: The total number of wavefronts restored from a context-save. See `cwsr_enable + `_. + unit: Wavefronts + VGPRs: + plain: 'The number of architected vector general-purpose registers allocated + for the kernel, see VALU. Note: this may not exactly match the number of VGPRs + requested by the compiler due to allocation granularity.' + rst: 'The number of architected vector general-purpose registers allocated for the + kernel, see :ref:`VALU `. Note: this may not exactly match the + number of VGPRs requested by the compiler due to allocation granularity.' + unit: VGPRs + AGPRs: + plain: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see AGPRs. Note: this may not exactly match the number of + AGPRs requested by the compiler due to allocation granularity.' + rst: 'The number of accumulation vector general-purpose registers allocated + for the kernel, see :ref:`AGPRs `. Note: this may not exactly match + the number of AGPRs requested by the compiler due to allocation granularity.' 
+ unit: AGPRs + SGPRs: + plain: 'The number of scalar general-purpose registers allocated for the kernel, + see SALU. Note: this may not exactly match the number of SGPRs requested by + the compiler due to allocation granularity.' + rst: 'The number of scalar general-purpose registers allocated for the kernel, see + :ref:`SALU `. Note: this may not exactly match the number of + SGPRs requested by the compiler due to allocation granularity.' + unit: SGPRs + LDS Allocation: + plain: 'The number of bytes of LDS memory (or, shared memory) allocated for + this kernel. Note: This may also be larger than what was requested at compile + time due to both allocation granularity and dynamic per-dispatch LDS allocations.' + rst: 'The number of bytes of :doc:`LDS ` memory (or, shared memory) + allocated for this kernel. Note: This may also be larger than what was requested + at compile time due to both allocation granularity and dynamic per-dispatch + LDS allocations.' + unit: Bytes per workgroup + Scratch Allocation: + plain: The number of bytes of scratch memory requested per work-item for this + kernel. Scratch memory is used for stack memory on the accelerator, as well + as for register spills and restores. + rst: The number of bytes of :ref:`scratch memory ` requested per + work-item for this kernel. Scratch memory is used for stack memory on the + accelerator, as well as for register spills and restores. + unit: Bytes per work-item + Kernel Time: + plain: The total duration of the executed kernel. + rst: The total duration of the executed kernel. + unit: Nanoseconds + Kernel Time (Cycles): + plain: The total duration of the executed kernel in cycles. + rst: The total duration of the executed kernel in cycles. + unit: Cycles + Instructions per wavefront: + plain: The average number of instructions (of all types) executed per wavefront. + This is averaged over all wavefronts in a kernel dispatch.
+ rst: The average number of instructions (of all types) executed per wavefront. + This is averaged over all wavefronts in a kernel dispatch. + unit: Instructions per wavefront + Wave Cycles: + plain: The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per normalization unit. This is averaged over all wavefronts + in a kernel dispatch. + rst: 'The number of cycles a wavefront in the kernel dispatch spent resident + on a compute unit per :ref:`normalization unit `. This is + averaged over all wavefronts in a kernel dispatch. Note: this should not + be directly compared to the kernel cycles above.' + unit: Cycles per normalization unit + Dependency Wait Cycles: + plain: The number of cycles a wavefront in the kernel dispatch stalled waiting + on memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) + per normalization unit. This counter is incremented at every cycle by all wavefronts + on a CU stalled at a memory operation. As such, it is most useful to get a + sense of how waves were spending their time, rather than identification of + a precise limiter because another wave could be actively executing while a + wave is stalled. The sum of this metric, Issue Wait Cycles and Active Cycles + should be equal to the total Wave Cycles metric. + rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on + memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) + per :ref:`normalization unit `. This counter is incremented + at every cycle by *all* wavefronts on a CU stalled at a memory operation. As + such, it is most useful to get a sense of how waves were spending their time, + rather than identification of a precise limiter because another wave could + be actively executing while a wave is stalled. The sum of this metric, Issue + Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. + unit: Cycles per normalization unit + Issue Wait Cycles: + plain: The number of cycles a wavefront in the kernel dispatch was unable to + issue an instruction for any reason (e.g., execution pipe back-pressure, arbitration + loss, etc.) per normalization unit. This counter is incremented at every cycle + by all wavefronts on a CU unable to issue an instruction.
As such, it is most + useful to get a sense of how waves were spending their time, rather than identification + of a precise limiter because another wave could be actively executing while + a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and + Active Cycles should be equal to the total Wave Cycles metric. + rst: The number of cycles a wavefront in the kernel dispatch was unable to issue + an instruction for any reason (e.g., execution pipe back-pressure, arbitration + loss, etc.) per :ref:`normalization unit `. This counter + is incremented at every cycle by *all* wavefronts on a CU unable to issue + an instruction. As such, it is most useful to get a sense of how waves were spending + their time, rather than identification of a precise limiter because another + wave could be actively executing while a wave is issue stalled. The sum + of this metric, Dependency Wait Cycles and Active Cycles should be equal + to the total Wave Cycles metric. + unit: Cycles per normalization unit + Active Cycles: + plain: The average number of cycles a wavefront in the kernel dispatch was actively + executing instructions per normalization unit. This measurement is made on + a per-wavefront basis, and may include cycles that another wavefront spent + actively executing (on another execution unit, for example) or was stalled. + As such, it is most useful to get a sense of how waves were spending their + time, rather than identification of a precise limiter. The sum of this metric, + Issue Wait Cycles and Dependency Wait Cycles should be equal to the total Wave + Cycles metric. + rst: The average number of cycles a wavefront in the kernel dispatch was actively + executing instructions per :ref:`normalization unit `. + This measurement is made on a per-wavefront basis, and may include cycles + that another wavefront spent actively executing (on another execution unit, + for example) or was stalled.
As such, it is most useful to get a sense of + how waves were spending their time, rather than identification of a precise + limiter. The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles + should be equal to the total Wave Cycles metric. + unit: Cycles per normalization unit + Wavefront Occupancy: + plain: 'The time-averaged number of wavefronts resident on the accelerator over + the lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + rst: 'The time-averaged number of wavefronts resident on the accelerator over the + lifetime of the kernel. Note: this metric may be inaccurate for short-running + kernels (less than 1ms).' + unit: Wavefronts +- id: 1000 + title: Compute Units - Instruction Mix + data source: + - metric_table: + id: 1001 + title: Overall Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH /
$denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + gfx941: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + gfx940: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: 
(instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + gfx942: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + gfx950: + VALU: + avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) + unit: (instr + $normUnit) + VMEM: + avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) + max: MAX(((SQ_INSTS_VMEM) / $denom)) + unit: (instr + $normUnit) + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + MFMA: + avg: AVG((SQ_INSTS_MFMA / $denom)) + min: 
MIN((SQ_INSTS_MFMA / $denom)) + max: MAX((SQ_INSTS_MFMA / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + gfx908: + LDS: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (instr + $normUnit) + SALU: + avg: AVG((SQ_INSTS_SALU / $denom)) + min: MIN((SQ_INSTS_SALU / $denom)) + max: MAX((SQ_INSTS_SALU / $denom)) + unit: (instr + $normUnit) + SMEM: + avg: AVG((SQ_INSTS_SMEM / $denom)) + min: MIN((SQ_INSTS_SMEM / $denom)) + max: MAX((SQ_INSTS_SMEM / $denom)) + unit: (instr + $normUnit) + Branch: + avg: AVG((SQ_INSTS_BRANCH / $denom)) + min: MIN((SQ_INSTS_BRANCH / $denom)) + max: MAX((SQ_INSTS_BRANCH / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1002 + title: VALU Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: 
+ avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + gfx941: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: 
MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / 
$denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + gfx940: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / 
$denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + gfx942: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: 
MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + gfx950: + INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) + unit: (instr + $normUnit) + 
INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) + unit: (instr + $normUnit) + F16-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) + unit: (instr + $normUnit) + F16-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) + unit: (instr + $normUnit) + F16-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) + unit: (instr + $normUnit) + F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + unit: (instr + $normUnit) + F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + unit: (instr + $normUnit) + F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + F32-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) + unit: (instr + $normUnit) + F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) + F64-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) + unit: (instr + $normUnit) + F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + unit: (instr + $normUnit) + F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: 
MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + F64-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) + unit: (instr + $normUnit) + Conversion: + avg: AVG((SQ_INSTS_VALU_CVT / $denom)) + min: MIN((SQ_INSTS_VALU_CVT / $denom)) + max: MAX((SQ_INSTS_VALU_CVT / $denom)) + unit: (instr + $normUnit) + gfx908: {} + - metric_table: + id: 1003 + title: VMEM Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: 
MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + gfx941: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + gfx940: + Global/Generic Instr: + avg: 
AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + gfx942: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr 
+ $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + gfx950: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / 
$denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Coalesceable Instr: + avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + gfx908: + Global/Generic Instr: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Read: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Write: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Global/Generic Atomic: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack 
Instr: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Read: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Write: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + Spill/Stack Atomic: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (instr + $normUnit) + - metric_table: + id: 1004 + title: MFMA Arithmetic Instruction Mix + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + gfx941: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: 
MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + gfx940: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / 
$denom)) + unit: (instr + $normUnit) + gfx942: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + $normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + gfx950: + MFMA-I8: + avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + unit: (instr + $normUnit) + MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + MFMA-F16: + avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) + unit: (instr + $normUnit) + MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) + MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) + unit: (instr + 
$normUnit) + MFMA-F64: + avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) + unit: (instr + $normUnit) + MFMA-F6F4: + avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) + unit: (instr + $normUnit) + gfx908: {} + metrics_description: + VALU: + plain: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the compute unit, and are used to execute a wide + range of instruction types including floating point operations, non-uniform + address calculations, transcendental operations, integer operations, shifts, + conditional evaluation, etc. + rst: The total number of vector arithmetic logic unit (VALU) operations issued. + These are the workhorses of the :doc:`compute unit `, and are + used to execute a wide range of instruction types including floating point + operations, non-uniform address calculations, transcendental operations, + integer operations, shifts, conditional evaluation, etc. + unit: Instructions + VMEM: + plain: The total number of vector memory operations issued. These include most + loads, stores and atomic operations and all accesses to generic, global, private + and texture memory. + rst: The total number of vector memory operations issued. These include most loads, + stores and atomic operations and all accesses to :ref:`generic, global, private + and texture ` memory. + unit: Instructions + LDS: + plain: The total number of LDS (also known as shared memory) operations issued. + These include loads, stores, atomics, and HIP's __shfl operations. + rst: The total number of LDS (also known as shared memory) operations issued. These + include loads, stores, atomics, and HIP's ``__shfl`` operations. + unit: Instructions + MFMA: + plain: The total number of matrix fused multiply-add instructions issued. 
+ rst: The total number of matrix fused multiply-add instructions issued. + unit: Instructions + SALU: + plain: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and + other operations that are provably uniform across a wavefront. Although scalar + memory (SMEM) operations are issued by the SALU, they are counted separately + in this section. + rst: The total number of scalar arithmetic logic unit (SALU) operations issued. + Typically these are used for address calculations, literal constants, and + other operations that are provably uniform across a wavefront. Although scalar + memory (SMEM) operations are issued by the SALU, they are counted separately + in this section. + unit: Instructions + SMEM: + plain: The total number of scalar memory (SMEM) operations issued. These are + typically used for loading kernel arguments, base-pointers and loads from + HIP's __constant__ memory. + rst: The total number of scalar memory (SMEM) operations issued. These are typically + used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__`` + memory. + unit: Instructions + Branch: + plain: The total number of branch operations issued. These typically consist + of jump or branch operations and are used to implement control flow. + rst: The total number of branch operations issued. These typically consist of jump + or branch operations and are used to implement control flow. + unit: Instructions + INT32: + plain: The total number of instructions operating on 32-bit integer operands + issued to the VALU per normalization unit. + rst: The total number of instructions operating on 32-bit integer operands issued + to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + INT64: + plain: The total number of instructions operating on 64-bit integer operands + issued to the VALU per normalization unit. 
+ rst: The total number of instructions operating on 64-bit integer operands issued + to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-ADD: + plain: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + rst: The total number of addition instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-MUL: + plain: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per normalization unit. + rst: The total number of multiplication instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-FMA: + plain: The total number of fused multiply-add instructions operating on 16-bit + floating-point operands issued to the VALU per normalization unit. + rst: The total number of fused multiply-add instructions operating on 16-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F16-Trans: + plain: The total number of transcendental instructions (such as sqrt) operating + on 16-bit floating-point operands issued to the VALU per normalization unit. + rst: The total number of transcendental instructions (such as ``sqrt``) operating on + 16-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + F32-ADD: + plain: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + rst: The total number of addition instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `.
+ unit: Instructions per normalization unit + F32-MUL: + plain: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per normalization unit. + rst: The total number of multiplication instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-FMA: + plain: The total number of fused multiply-add instructions operating on 32-bit + floating-point operands issued to the VALU per normalization unit. + rst: The total number of fused multiply-add instructions operating on 32-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F32-Trans: + plain: The total number of transcendental instructions (such as sqrt) operating + on 32-bit floating-point operands issued to the VALU per normalization unit. + rst: The total number of transcendental instructions (such as ``sqrt``) operating + on 32-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + F64-ADD: + plain: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + rst: The total number of addition instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-MUL: + plain: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per normalization unit. + rst: The total number of multiplication instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. 
+ unit: Instructions per normalization unit + F64-FMA: + plain: The total number of fused multiply-add instructions operating on 64-bit + floating-point operands issued to the VALU per normalization unit. + rst: The total number of fused multiply-add instructions operating on 64-bit floating-point + operands issued to the VALU per :ref:`normalization unit `. + unit: Instructions per normalization unit + F64-Trans: + plain: The total number of transcendental instructions (such as sqrt) operating + on 64-bit floating-point operands issued to the VALU per normalization unit. + rst: The total number of transcendental instructions (such as ``sqrt``) operating + on 64-bit floating-point operands issued to the VALU per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Conversion: + plain: "The total number of type conversion instructions (such as converting\ + \ data to or from F32\u2194F64) issued to the VALU per normalization unit." + rst: "The total number of type conversion instructions (such as converting data\ + \ to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit\ + \ `." + unit: Instructions per normalization unit + Global/Generic Instr: + plain: The total number of global & generic memory instructions executed on + all compute units on the accelerator, per normalization unit. + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Global/Generic Read: + plain: The total number of global & generic memory read instructions executed + on all compute units on the accelerator, per normalization unit. + rst: The total number of global & generic memory read instructions executed + on all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `.
+ unit: Instructions per normalization unit + Global/Generic Write: + plain: The total number of global & generic memory write instructions executed + on all compute units on the accelerator, per normalization unit. + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic: + plain: The total number of global & generic memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per + normalization unit. + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instr: + plain: The total number of spill/stack memory instructions executed on all compute + units on the accelerator, per normalization unit. + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read: + plain: The total number of spill/stack memory read instructions executed on + all compute units on the accelerator, per normalization unit. + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write: + plain: The total number of spill/stack memory write instructions executed on + all compute units on the accelerator, per normalization unit. + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. 
+ unit: Instructions per normalization unit + Spill/Stack Atomic: + plain: The total number of spill/stack memory atomic (with and without return) + instructions executed on all compute units on the accelerator, per normalization + unit. Typically unused, as these memory operations are generally used to implement + thread-local storage. + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. Typically unused, as these + memory operations are generally used to implement thread-local storage. + unit: Instructions per normalization unit + MFMA-I8: + plain: The total number of 8-bit integer MFMA instructions issued per normalization + unit. + rst: The total number of 8-bit integer :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F8: + plain: The total number of 8-bit floating point MFMA instructions issued per + normalization unit. This is supported in AMD Instinct MI300 series and later + only. + rst: The total number of 8-bit floating point :ref:`MFMA ` instructions issued + per :ref:`normalization unit `. This is supported in + AMD Instinct MI300 series and later only. + unit: Instructions per normalization unit + MFMA-F16: + plain: The total number of 16-bit floating point MFMA instructions issued per + normalization unit. + rst: The total number of 16-bit floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-BF16: + plain: The total number of 16-bit brain floating point MFMA instructions issued + per normalization unit. + rst: The total number of 16-bit brain floating point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `.
+ unit: Instructions per normalization unit + MFMA-F32: + plain: The total number of 32-bit floating-point MFMA instructions issued per + normalization unit. + rst: The total number of 32-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit + MFMA-F64: + plain: The total number of 64-bit floating-point MFMA instructions issued per + normalization unit. + rst: The total number of 64-bit floating-point :ref:`MFMA ` instructions + issued per :ref:`normalization unit `. + unit: Instructions per normalization unit +- id: 1100 + title: Compute Units - Compute Pipeline + data source: + - metric_table: + id: 1101 + title: Compute Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + peak: Peak + pop: Pct of Peak + metric: + gfx90a: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * 
AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) + gfx941: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * 
SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / 
((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + gfx940: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: 
AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / 
(End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + gfx942: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - 
Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + gfx950: + VALU FLOPs: + value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 
1000) + pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) + / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + VALU IOPs: + value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp + - Start_Timestamp))) + unit: GIOP + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / + (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) + * 64) * 2) / 1000)) + MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + MFMA FLOPs (BF16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + MFMA FLOPs (F64): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + MFMA FLOPs (F6F4): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000)) + MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + gfx908: {} + - metric_table: + id: 1102 + title: Pipeline Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle
+ SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else 
None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + gfx941: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / 
$GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: 
MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + gfx940: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / 
$cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / 
SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + gfx942: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: 
MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + gfx950: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC 
(Issued): + avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Co-Issue Efficiency: + avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) + min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) + max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) + unit: pct + VMEM Utilization: + avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) + / $cu_per_gpu)) + unit: pct + Branch Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / 
$GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + MFMA Utilization: + avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + MFMA Instruction Cycles: + avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA + != 0) else None)) + unit: cycles/instr + VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + SMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_SMEM + gfx908: + IPC: + avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) + unit: Instr/cycle + IPC (Issued): + avg: 
AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) + + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) + / SQ_ACTIVE_INST_ANY)) + unit: Instr/cycle + SALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Utilization: + avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct + VALU Active Threads: + avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU + != 0) else None)) + unit: Threads + - metric_table: + id: 1103 + title: Arithmetic Operations + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + 
(512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * 
SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + gfx941: + FLOPs (Total): + avg: 
AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + 
(SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * 
SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + gfx940: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + 
SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + 
SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + gfx942: + FLOPs (Total): + avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + 
SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * 
SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + gfx950: + FLOPs (Total): + avg: 
AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) + / $denom)) + min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) + / $denom)) + max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * + SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 + * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) + / $denom)) + unit: (OPs + $normUnit) + IOPs (Total): + avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + 
(SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 + * 512)) / $denom) + unit: (OPs + $normUnit) + F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) + F16 OPs: + avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + unit: (OPs + $normUnit) + BF16 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) + unit: (OPs + $normUnit) + F32 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + / $denom)) + unit: (OPs + $normUnit) + F64 OPs: + avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + 
SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) + / $denom)) + unit: (OPs + $normUnit) + F6F4 OPs: + avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) + min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) + max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) + unit: (OPs + $normUnit) + INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + gfx908: {} + metrics_description: + VALU FLOPs: + plain: 'The total floating-point operations executed per second on the VALU. + This is also presented as a percent of the peak theoretical FLOPs achievable + on the specific accelerator. Note: this does not include any floating-point + operations from MFMA instructions.' + rst: 'The total floating-point operations executed per second on the :ref:`VALU + `. This is also presented as a percent of the peak theoretical + FLOPs achievable on the specific accelerator. Note: this does not include + any floating-point operations from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU IOPs: + plain: 'The total integer operations executed per second on the VALU. This is + also presented as a percent of the peak theoretical IOPs achievable on the + specific accelerator. Note: this does not include any integer operations from + MFMA instructions.' + rst: 'The total integer operations executed per second on the :ref:`VALU `. 
+ This is also presented as a percent of the peak theoretical IOPs achievable + on the specific accelerator. Note: this does not include any integer operations + from :ref:`MFMA ` instructions.' + unit: GIOPs + MFMA FLOPs (BF16): + plain: 'The total number of 16-bit brain floating point MFMA operations executed + per second. Note: this does not include any 16-bit brain floating point operations + from VALU instructions. This is also presented as a percent of the peak theoretical + BF16 MFMA operations achievable on the specific accelerator.' + rst: 'The total number of 16-bit brain floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit brain floating + point operations from :ref:`VALU ` instructions. This is also + presented as a percent of the peak theoretical BF16 MFMA operations achievable + on the specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F16): + plain: 'The total number of 16-bit floating point MFMA operations executed per + second. Note: this does not include any 16-bit floating point operations from + VALU instructions. This is also presented as a percent of the peak theoretical + F16 MFMA operations achievable on the specific accelerator.' + rst: 'The total number of 16-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 16-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F16 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F32): + plain: 'The total number of 32-bit floating point MFMA operations executed per + second. Note: this does not include any 32-bit floating point operations from + VALU instructions. This is also presented as a percent of the peak theoretical + F32 MFMA operations achievable on the specific accelerator.' + rst: 'The total number of 32-bit floating point :ref:`MFMA ` operations + executed per second. 
Note: this does not include any 32-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F32 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA FLOPs (F64): + plain: 'The total number of 64-bit floating point MFMA operations executed per + second. Note: this does not include any 64-bit floating point operations from + VALU instructions. This is also presented as a percent of the peak theoretical + F64 MFMA operations achievable on the specific accelerator.' + rst: 'The total number of 64-bit floating point :ref:`MFMA ` operations + executed per second. Note: this does not include any 64-bit floating point + operations from :ref:`VALU ` instructions. This is also presented + as a percent of the peak theoretical F64 MFMA operations achievable on the + specific accelerator.' + unit: GFLOPs + MFMA IOPs (INT8): + plain: 'The total number of 8-bit integer MFMA operations executed per second. + Note: this does not include any 8-bit integer operations from VALU instructions. + This is also presented as a percent of the peak theoretical INT8 MFMA operations + achievable on the specific accelerator.' + rst: 'The total number of 8-bit integer :ref:`MFMA ` operations executed + per second. Note: this does not include any 8-bit integer operations from + :ref:`VALU ` instructions. This is also presented as a percent + of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.' + unit: GIOPs + IPC: + plain: The ratio of the total number of instructions executed on the CU over + the total active CU cycles.
+ rst: The ratio of the total number of instructions executed on the :doc:`CU + ` over the :ref:`total active CU cycles `. + unit: Instructions per cycle + IPC (Issued): + plain: The ratio of the total number of (non-internal) instructions issued over + the number of cycles where the scheduler was actively working on issuing instructions. + rst: The ratio of the total number of (non-:ref:`internal `) + instructions issued over the number of cycles where the :ref:`scheduler ` + was actively working on issuing instructions. Refer to the :ref:`Issued + IPC ` example for further detail. + unit: Instructions per cycle + SALU Utilization: + plain: Indicates what percent of the kernel's duration the SALU was busy executing + instructions. Computed as the ratio of the total number of cycles spent by + the scheduler issuing SALU / SMEM instructions over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`SALU ` + was busy executing instructions. Computed as the ratio of the total number + of cycles spent by the :ref:`scheduler ` issuing SALU / :ref:`SMEM + ` instructions over the :ref:`total CU cycles `. + unit: Percent + VALU Utilization: + plain: Indicates what percent of the kernel's duration the VALU was busy executing + instructions. Does not include VMEM operations. Computed as the ratio of the + total number of cycles spent by the scheduler issuing VALU instructions over + the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`VALU ` + was busy executing instructions. Does not include :ref:`VMEM ` + operations. Computed as the ratio of the total number of cycles spent by + the :ref:`scheduler ` issuing VALU instructions over the + :ref:`total CU cycles `. 
+ unit: Percent + VMEM Utilization: + plain: Indicates what percent of the kernel's duration the VMEM unit was busy + executing instructions, including both global/generic and spill/scratch operations + (see the VMEM instruction count metrics for more detail). Does not include + VALU operations. Computed as the ratio of the total number of cycles spent + by the scheduler issuing VMEM instructions over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`VMEM ` + unit was busy executing instructions, including both global/generic and spill/scratch + operations (see the :ref:`VMEM instruction count metrics ` + for more detail). Does not include :ref:`VALU ` operations. Computed as + the ratio of the total number of cycles spent by the :ref:`scheduler ` + issuing VMEM instructions over the :ref:`total CU cycles `. + unit: Percent + Branch Utilization: + plain: Indicates what percent of the kernel's duration the branch unit was busy + executing instructions. Computed as the ratio of the total number of cycles + spent by the scheduler issuing branch instructions over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`branch ` + unit was busy executing instructions. Computed as the ratio of the total + number of cycles spent by the :ref:`scheduler ` issuing branch + instructions over the :ref:`total CU cycles `. + unit: Percent + VALU Active Threads: + plain: Indicates the average level of divergence within a wavefront over the + lifetime of the kernel. The number of work-items that were active in a wavefront + during execution of each VALU instruction, time-averaged over all VALU instructions + run on all wavefronts in the kernel. + rst: Indicates the average level of :ref:`divergence ` within a + wavefront over the lifetime of the kernel.
The number of work-items that + were active in a wavefront during execution of each :ref:`VALU ` + instruction, time-averaged over all VALU instructions run on all wavefronts + in the kernel. + unit: Work-items + MFMA Utilization: + plain: Indicates what percent of the kernel's duration the MFMA unit was busy + executing instructions. Computed as the ratio of the total number of cycles + the MFMA unit was busy over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`MFMA ` + unit was busy executing instructions. Computed as the ratio of the total + number of cycles the :ref:`MFMA ` unit was busy over the :ref:`total + CU cycles `. + unit: Percent + MFMA Instruction Cycles: + plain: The average duration of MFMA instructions in this kernel in cycles. Computed + as the ratio of the total number of cycles the MFMA unit was busy over the + total number of MFMA instructions. + rst: The average duration of :ref:`MFMA ` instructions in this kernel + in cycles. Computed as the ratio of the total number of cycles the MFMA unit + was busy over the total number of MFMA instructions. Compare to, for example, + the `AMD Matrix Instruction Calculator <https://github.com/ROCm/amd_matrix_instruction_calculator>`_. + unit: Cycles per instruction + VMEM Latency: + plain: The average number of round-trip cycles (that is, from issue to data + return / acknowledgment) required for a VMEM instruction to complete. + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a VMEM instruction to complete. + unit: Cycles + SMEM Latency: + plain: The average number of round-trip cycles (that is, from issue to data + return / acknowledgment) required for a SMEM instruction to complete. + rst: The average number of round-trip cycles (that is, from issue to data return + / acknowledgment) required for a SMEM instruction to complete.
+ unit: Cycles + FLOPs (Total): + plain: The total number of floating-point operations executed on either the + VALU or MFMA units, per normalization unit. + rst: The total number of floating-point operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: FLOP per normalization unit + IOPs (Total): + plain: The total number of integer operations executed on either the VALU or + MFMA units, per normalization unit. + rst: The total number of integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. + unit: IOP per normalization unit + F16 OPs: + plain: The total number of 16-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + rst: The total number of 16-bit floating-point operations executed on either + the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + BF16 OPs: + plain: The total number of 16-bit brain floating-point operations executed on + either the VALU or MFMA units, per normalization unit. + rst: 'The total number of 16-bit brain floating-point operations executed on + either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. Note: on current CDNA accelerators, the VALU + has no native BF16 instructions.' + unit: FLOP per normalization unit + F32 OPs: + plain: The total number of 32-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. + rst: The total number of 32-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + F64 OPs: + plain: The total number of 64-bit floating-point operations executed on either + the VALU or MFMA units, per normalization unit. 
+ rst: The total number of 64-bit floating-point operations executed on either the + :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization + unit `. + unit: FLOP per normalization unit + INT8 OPs: + plain: The total number of 8-bit integer operations executed on either the VALU + or MFMA units, per normalization unit. + rst: 'The total number of 8-bit integer operations executed on either the :ref:`VALU + ` or :ref:`MFMA ` units, per :ref:`normalization unit + `. Note: on current CDNA accelerators, the VALU has + no native INT8 instructions.' + unit: IOP per normalization unit +- id: 1200 + title: Local Data Share (LDS) + data source: + - metric_table: + id: 1201 + title: LDS Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + gfx90a: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth: + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + gfx941: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / 
(SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + gfx940: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + gfx942: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + gfx950: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth (% of Peak): + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + unit: Pct of Peak + Bank 
Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + gfx908: + Utilization: + value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Access Rate: + value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: Pct of Peak + Theoretical Bandwidth: + value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * + 0.00128))) + unit: Pct of Peak + Bank Conflict Rate: + value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1202 + title: LDS Statistics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / 
(SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) + gfx941: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * 
TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) + gfx940: + LDS Instructions: + avg: 
AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: 
(Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) + gfx942: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: 
MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) + gfx950: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + LDS LOAD: + avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) + min: MIN((SQ_INSTS_LDS_LOAD / $denom)) + max: MAX((SQ_INSTS_LDS_LOAD / $denom)) + unit: (instr + $normUnit) + LDS STORE: + avg: AVG((SQ_INSTS_LDS_STORE / $denom)) + min: MIN((SQ_INSTS_LDS_STORE / $denom)) + max: MAX((SQ_INSTS_LDS_STORE / $denom)) + unit: (instr + $normUnit) + LDS ATOMIC: + avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) + min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) + max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) + unit: (instr + $normUnit) + LDS LOAD Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + unit: Gbps + LDS STORE Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 *
SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + unit: Gbps + LDS ATOMIC Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + unit: Gbps + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT /
$denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) + LDS Command FIFO Full Rate: + avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + LDS Data FIFO Full Rate: + avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + gfx908: + LDS Instructions: + avg: AVG((SQ_INSTS_LDS / $denom)) + min: MIN((SQ_INSTS_LDS / $denom)) + max: MAX((SQ_INSTS_LDS / $denom)) + unit: (Instr + $normUnit) + Theoretical Bandwidth: + avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) + / $denom)) + unit: (Bytes + $normUnit) + LDS Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) + else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_LDS + Bank Conflicts/Access: + avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) != 0) else None)) + min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) + if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) + unit: Conflicts/Access + Index Accesses: + avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) + min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) + max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) + unit: (Cycles + $normUnit) + Atomic Return Cycles: + avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) + min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) + max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) + unit: (Cycles + $normUnit) + Bank Conflict: + avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) + min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) + max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Addr Conflict: + avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) + min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) + max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) + unit: (Cycles + $normUnit) + Unaligned Stall: + avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) + min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) + max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) + unit: (Cycles + $normUnit) + Mem Violations: + avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) + min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) + max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) + unit: (Accesses + $normUnit) + metrics_description: + Utilization: + plain: Indicates what percent of the kernel's duration the LDS was actively + executing instructions (including, but not limited to, load, store, atomic + and HIP's __shfl operations). Calculated as the ratio of the total number + of cycles LDS was active over the total CU cycles. + rst: Indicates what percent of the kernel's duration the :ref:`LDS ` was + actively executing instructions (including, but not limited to, load, store, + atomic and HIP's ``__shfl`` operations). 
Calculated as the ratio of the + total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`. + unit: Percent + Access Rate: + plain: Indicates the percentage of SIMDs in the VALU actively issuing LDS instructions, + averaged over the lifetime of the kernel. Calculated as the ratio of the total + number of cycles spent by the scheduler issuing LDS instructions over the + total CU cycles. + rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_ + actively issuing LDS instructions, averaged over the lifetime of the kernel. + Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler + <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total + CU cycles <total-cu-cycles>`. + unit: Percent + Theoretical Bandwidth: + plain: Indicates the maximum number of bytes that could have been loaded from, + stored to, or atomically updated in the LDS per normalization unit. Does not + take into account the execution mask of the wavefront when the instruction + was executed. + rst: Indicates the maximum number of bytes that could have been loaded from, stored + to, or atomically updated in the LDS per :ref:`normalization unit <normalization-units>`. + Does *not* take into account the execution mask of the wavefront when the + instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` + for more detail. + unit: Bytes per normalization unit + Bank Conflict Rate: + plain: Indicates the percentage of active LDS cycles that were spent servicing + bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank + conflicts over the number of LDS cycles that would have been required to move + the same amount of data in an uncontended access. + rst: Indicates the percentage of active LDS cycles that were spent servicing bank + conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts + over the number of LDS cycles that would have been required to move the same + amount of data in an uncontended access.
[#lds-bank-conflict]_ + unit: Percent + LDS Instructions: + plain: The total number of LDS instructions (including, but not limited to, + read/write/atomics and HIP's __shfl instructions) executed per normalization + unit. + rst: The total number of LDS instructions (including, but not limited to, read/write/atomics + and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit + `. + unit: Instructions per normalization unit + LDS Latency: + plain: The average number of round-trip cycles (i.e., from issue to data-return + / acknowledgment) required for an LDS instruction to complete. + rst: The average number of round-trip cycles (i.e., from issue to data-return / + acknowledgment) required for an LDS instruction to complete. + unit: Cycles + Bank Conflicts/Access: + plain: The ratio of the number of cycles spent in the LDS scheduler due to bank + conflicts (as determined by the conflict resolution hardware) to the base + number of cycles that would be spent in the LDS scheduler in a completely + uncontended case. This is the unnormalized form of the Bank Conflict Rate. + rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler ` + due to bank conflicts (as determined by the conflict resolution hardware) + to the base number of cycles that would be spent in the LDS scheduler in + a completely uncontended case. This is the unnormalized form of the Bank + Conflict Rate. + unit: Conflicts per Access + Index Accesses: + plain: The total number of cycles spent in the LDS scheduler over all operations + per normalization unit. + rst: The total number of cycles spent in the :ref:`LDS scheduler ` over + all operations per :ref:`normalization unit `. + unit: Cycles per normalization unit + Atomic Return Cycles: + plain: The total number of cycles spent on LDS atomics with return per normalization + unit. + rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization + unit `. 
+ unit: Cycles per normalization unit + Bank Conflict: + plain: The total number of cycles spent in the LDS scheduler due to bank conflicts + (as determined by the conflict resolution hardware) per normalization unit. + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to bank conflicts (as determined by the conflict resolution hardware) per + :ref:`normalization unit `. + unit: Cycles per normalization unit + Addr Conflict: + plain: The total number of cycles spent in the LDS scheduler due to address + conflicts (as determined by the conflict resolution hardware) per normalization + unit. + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to address conflicts (as determined by the conflict resolution hardware) + per :ref:`normalization unit `. + unit: Cycles per normalization unit + Unaligned Stall: + plain: The total number of cycles spent in the LDS scheduler due to stalls from + non-dword aligned addresses per normalization unit. + rst: The total number of cycles spent in the :ref:`LDS scheduler ` due + to stalls from non-dword aligned addresses per :ref:`normalization unit `. + unit: Cycles per normalization unit + Mem Violations: + plain: "The total number of out-of-bounds accesses made to the LDS, per normalization\ + \ unit. This is unused and expected to be zero in most configurations for\ + \ modern CDNA\u2122 accelerators." + rst: "The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization\ + \ unit `. This is unused and expected to be zero in\ + \ most configurations for modern CDNA\u2122 accelerators." 
+ unit: Accesses per normalization unit +- id: 1300 + title: Instruction Cache + data source: + - metric_table: + id: 1301 + title: L1I Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + gfx90a: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + gfx941: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + gfx940: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + gfx942: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * 
$sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + gfx950: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + gfx908: + Bandwidth: + value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: Pct of Peak + L1I-L2 Bandwidth: + value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) + * (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1302 + title: L1I cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + 
SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + gfx941: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + gfx940: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: 
MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + gfx942: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / 
((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + gfx950: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + gfx908: + Req: + avg: AVG((SQC_ICACHE_REQ / $denom)) + min: MIN((SQC_ICACHE_REQ / $denom)) + max: MAX((SQC_ICACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_ICACHE_HITS / $denom)) + min: MIN((SQC_ICACHE_HITS / $denom)) + max: MAX((SQC_ICACHE_HITS / $denom)) + unit: (Hits + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_ICACHE_MISSES / $denom)) + min: MIN((SQC_ICACHE_MISSES / $denom)) + max: 
MAX((SQC_ICACHE_MISSES / $denom)) + unit: (Misses + $normUnit) + Misses - Duplicated: + avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) + unit: (Misses + $normUnit) + Cache Hit Rate: + avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) + + SQC_ICACHE_MISSES_DUPLICATE))) + unit: pct + Instruction Fetch Latency: + avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) + unit: Cycles + coll_level: SQ_IFETCH_LEVEL + - metric_table: + id: 1303 + title: L1I <-> L2 interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + gfx941: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + gfx940: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + gfx942: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + gfx950: + L1I-L2 Bandwidth: + avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + gfx908: + L1I-L2 Bandwidth: + 
avg: AVG(((SQC_TC_INST_REQ * 64) / $denom)) + min: MIN(((SQC_TC_INST_REQ * 64) / $denom)) + max: MAX(((SQC_TC_INST_REQ * 64) / $denom)) + unit: (Bytes + $normUnit) + metrics_description: + Bandwidth: + plain: The number of bytes looked up in the L1I cache, as a percent of the peak + theoretical bandwidth. Calculated as the ratio of L1I requests over the total + L1I cycles. + rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I + cycles <total-l1i-cycles>`. + unit: Percent + Cache Hit Rate: + plain: The percent of L1I requests that hit on a previously loaded + line in the cache. Calculated as the ratio of the number of L1I requests that + hit over the number of all L1I requests. + rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line + in the cache. Calculated as the ratio of the number of L1I requests that hit + over the number of all L1I requests. + unit: Percent + L1I-L2 Bandwidth: + plain: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\ + \ achieved. Calculated as the ratio of the total number of requests from the\ + \ L1I to the L2 cache over the total L1I-L2 interface cycles." + rst: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\ + \ achieved. Calculated as the ratio of the total number of requests from\ + \ the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`." + unit: Percent + Req: + plain: The total number of requests made to the L1I per normalization-unit. + rst: The total number of requests made to the L1I per normalization-unit. + unit: Requests per normalization unit + Hits: + plain: The total number of L1I requests that hit on a previously loaded cache + line, per normalization-unit. + rst: The total number of L1I requests that hit on a previously loaded cache line, + per :ref:`normalization-unit <normalization-units>`.
+ unit: Requests per normalization unit + Misses - Non Duplicated: + plain: The total number of L1I requests that missed on a cache line that were + not already pending due to another request, per normalization-unit. + rst: The total number of L1I requests that missed on a cache line that *were + not* already pending due to another request, per :ref:`normalization-unit + <normalization-units>`. See note in :ref:`desc-l1i-sol` for more detail. + unit: Requests per normalization unit + Misses - Duplicated: + plain: The total number of L1I requests that missed on a cache line that were + already pending due to another request, per normalization-unit. + rst: The total number of L1I requests that missed on a cache line that *were* already + pending due to another request, per :ref:`normalization-unit <normalization-units>`. + See note in :ref:`desc-l1i-sol` for more detail. + unit: Requests per normalization unit + Instruction Fetch Latency: + plain: The average number of cycles spent to fetch instructions to a CU. + rst: The average number of cycles spent to fetch instructions to a :doc:`CU + <compute-unit>`.
+ unit: Cycles +- id: 1400 + title: Scalar L1 Data Cache + data source: + - metric_table: + id: 1401 + title: Scalar L1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + gfx90a: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + gfx941: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + gfx940: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak 
+ gfx942: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + gfx950: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + gfx908: + Bandwidth: + value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * + (End_Timestamp - Start_Timestamp)))) + unit: Pct of Peak + Cache Hit Rate: + value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: Pct of Peak + sL1D-L2 BW: + value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1402 + title: Scalar L1D cache accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: 
Unit + metric: + gfx90a: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req 
+ $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + gfx941: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + 
SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + gfx940: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + 
$normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / 
$denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + gfx942: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) 
if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + gfx950: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + 
Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: 
MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + gfx908: + Req: + avg: AVG((SQC_DCACHE_REQ / $denom)) + min: MIN((SQC_DCACHE_REQ / $denom)) + max: MAX((SQC_DCACHE_REQ / $denom)) + unit: (Req + $normUnit) + Hits: + avg: AVG((SQC_DCACHE_HITS / $denom)) + min: MIN((SQC_DCACHE_HITS / $denom)) + max: MAX((SQC_DCACHE_HITS / $denom)) + unit: (Req + $normUnit) + Misses - Non Duplicated: + avg: AVG((SQC_DCACHE_MISSES / $denom)) + min: MIN((SQC_DCACHE_MISSES / $denom)) + max: MAX((SQC_DCACHE_MISSES / $denom)) + unit: (Req + $normUnit) + Misses- Duplicated: + avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) + unit: (Req + $normUnit) + Cache Hit Rate: + avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) + + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) + unit: pct + Read Req (Total): + avg: 
AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) + + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_DCACHE_ATOMIC / $denom)) + min: MIN((SQC_DCACHE_ATOMIC / $denom)) + max: MAX((SQC_DCACHE_ATOMIC / $denom)) + unit: (Req + $normUnit) + Read Req (1 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) + unit: (Req + $normUnit) + Read Req (2 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) + unit: (Req + $normUnit) + Read Req (4 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) + unit: (Req + $normUnit) + Read Req (8 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) + unit: (Req + $normUnit) + Read Req (16 DWord): + avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) + min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) + max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1403 + title: Scalar L1D Cache - L2 Interface + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + 
SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) + gfx941: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) + gfx940: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + 
SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) + gfx942: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) + gfx950: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + 
SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) + unit: (Cycles + $normUnit) + gfx908: + sL1D-L2 BW: + avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + Read Req: + avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) + min: MIN((SQC_TC_DATA_READ_REQ / $denom)) + max: MAX((SQC_TC_DATA_READ_REQ / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) + min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) + max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) + min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) + max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) + unit: (Req + $normUnit) + Stall Cycles: + avg: AVG((SQC_TC_STALL / $denom)) + min: MIN((SQC_TC_STALL / $denom)) + max: MAX((SQC_TC_STALL / $denom)) 
+ unit: (Cycles + $normUnit) + metrics_description: + Bandwidth: + plain: The number of bytes looked up in the sL1D cache, as a percent of the + peak theoretical bandwidth. Calculated as the ratio of sL1D requests over + the total sL1D cycles. + rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical + bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total + sL1D cycles `. + unit: Percent + Cache Hit Rate: + plain: Indicates the percent of sL1D requests that hit on a previously loaded + line in the cache. The ratio of the number of sL1D requests that hit over the + number of all sL1D requests. + rst: Indicates the percent of sL1D requests that hit on a previously loaded line + in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_ + over the number of all sL1D requests. + unit: Percent + sL1D-L2 BW: + plain: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194L2 interface, per normalization unit. Note that sL1D\ + \ writes and atomics are typically unused on current CDNA accelerators, so\ + \ in the majority of cases this can be interpreted as an sL1D\u2192L2 read\ + \ bandwidth." + rst: "The total number of bytes read from, written to, or atomically updated\ + \ across the sL1D\u2194:doc:`L2 ` interface, per :ref:`normalization\ + \ unit `. Note that sL1D writes and atomics are typically\ + \ unused on current CDNA accelerators, so in the majority of cases this can\ + \ be interpreted as an sL1D\u2192L2 read bandwidth." + unit: Bytes per normalization unit + Req: + plain: The total number of requests, of any size or type, made to the sL1D per + normalization unit. + rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization + unit `. + unit: Requests per normalization unit + Hits: + plain: The total number of sL1D requests that hit on a previously loaded cache + line, per normalization unit.
+ rst: The total number of sL1D requests that hit on a previously loaded cache line, + per :ref:`normalization unit `. + unit: Requests per normalization unit + Misses - Non Duplicated: + plain: 'The total number of sL1D requests that missed on a cache line that was + not already pending due to another request, per normalization unit. ' + rst: The total number of sL1D requests that missed on a cache line that *was not* + already pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. + unit: Requests per normalization unit + Misses- Duplicated: + plain: The total number of sL1D requests that missed on a cache line that was + already pending due to another request, per normalization unit. + rst: The total number of sL1D requests that missed on a cache line that *was* already + pending due to another request, per :ref:`normalization unit `. + See :ref:`desc-sl1d-sol` for more detail. + unit: Requests per normalization unit + Read Req (Total): + plain: The total number of sL1D read requests of any size, per normalization + unit. + rst: The total number of sL1D read requests of any size, per :ref:`normalization + unit `. + unit: Requests per normalization unit + Atomic Req: + plain: The total number of atomic requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + rst: The total number of atomic requests from sL1D to the :doc:`L2 `, + per :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + Read Req (1 DWord): + plain: The total number of sL1D read requests made for a single dword of data + (4B), per normalization unit. + rst: The total number of sL1D read requests made for a single dword of data (4B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (2 DWord): + plain: The total number of sL1D read requests made for two dwords of data + (8B), per normalization unit.
+ rst: The total number of sL1D read requests made for two dwords of data (8B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (4 DWord): + plain: The total number of sL1D read requests made for four dwords of data + (16B), per normalization unit. + rst: The total number of sL1D read requests made for four dwords of data (16B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (8 DWord): + plain: The total number of sL1D read requests made for eight dwords of data + (32B), per normalization unit. + rst: The total number of sL1D read requests made for eight dwords of data (32B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req (16 DWord): + plain: The total number of sL1D read requests made for sixteen dwords of data + (64B), per normalization unit. + rst: The total number of sL1D read requests made for sixteen dwords of data (64B), + per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + plain: The total number of read requests from sL1D to the L2, per normalization + unit. + rst: The total number of read requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. + unit: Requests per normalization unit + Write Req: + plain: The total number of write requests from sL1D to the L2, per normalization + unit. Typically unused on current CDNA accelerators. + rst: The total number of write requests from sL1D to the :doc:`L2 `, per + :ref:`normalization unit `. Typically unused on current + CDNA accelerators. + unit: Requests per normalization unit + Stall Cycles: + plain: "The total number of cycles the sL1D\u2194L2 interface was stalled, per\ + \ normalization unit." + rst: "The total number of cycles the sL1D\u2194 :doc:`L2 ` interface\ + \ was stalled, per :ref:`normalization unit `."
+ unit: Cycles per normalization unit +- id: 1500 + title: Address Processing Unit and Data Return Path (TA/TD) + data source: + - metric_table: + id: 1501 + title: Busy and stall metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) 
+ "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + gfx941: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: 
AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + gfx940: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + 
min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + gfx942: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) 
+ max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + gfx950: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Sequencer \u2192 TA Address Stall": + avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Command Stall": + avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + "Sequencer \u2192 TA Data Stall": + avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / 
$denom)) + unit: (Cycles + $normUnit) + gfx908: + Address Processing Unit Busy: + avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Address Stall: + avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + Data Stall: + avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + "Data-Processor \u2192 Address Stall": + avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD + * $cu_per_gpu))) + unit: pct + - metric_table: + id: 1502 + title: Instruction counts + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + gfx941: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min:
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + gfx940: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + gfx942: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + gfx950: + Total Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions for LDS: + avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions for LDS: + avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + gfx908: + Total 
Instructions: + avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) + min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) + max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Instructions: + avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Read Instructions: + avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Write Instructions: + avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Global/Generic Atomic Instructions: + avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Instructions: + avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Read Instructions: + avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Write Instructions: + avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + Spill/Stack Atomic Instructions: + avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) + - metric_table: + 
id: 1503 + title: Spill and stack metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + gfx941: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + gfx940: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + 
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + gfx942: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + gfx950: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + gfx908: + Spill/Stack Total Cycles: + avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Read: + avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + max: 
MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + Spill/Stack Coalesced Write: + avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) + unit: (Cycles + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: 
AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + gfx941: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: 
(Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + gfx940: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / 
$denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + gfx942: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + gfx950: + Data-Return Busy: + avg: AVG(((100 * 
TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Write Ack Instructions: + avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + unit: 
(Instructions + $normUnit) + gfx908: + Data-Return Busy: + avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Cache RAM \u2192 Data-Return Stall": + avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) + unit: pct + "Workgroup manager \u2192 Data-Return Stall": + avg: null + min: null + max: null + unit: pct + Coalescable Instructions: + avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Read Instructions: + avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) + / $denom)) + unit: (Instructions + $normUnit) + Write Instructions: + avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) + min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) + max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + Atomic Instructions: + avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) + min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) + max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + metrics_description: + Address Processing Unit Busy: + plain: Percent of the total CU cycles the address processor was busy + rst: Percent of the :ref:`total CU cycles ` the address processor + was busy + unit: Percent + Address Stall: + plain: Percent of the total CU cycles the address processor was stalled from + 
sending address requests further into the vL1D pipeline. + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending address requests further into the vL1D pipeline + unit: Percent + Data Stall: + plain: Percent of the total CU cycles the address processor was stalled from + sending write/atomic data further into the vL1D pipeline. + rst: Percent of the :ref:`total CU cycles ` the address processor + was stalled from sending write/atomic data further into the vL1D pipeline + unit: Percent + "Data-Processor \u2192 Address Stall": + plain: Percent of total CU cycles the address processor was stalled waiting + to send command data to the data processor. + rst: Percent of :ref:`total CU cycles ` the address processor was + stalled waiting to send command data to the :ref:`data processor ` + unit: Percent + Total Instructions: + plain: The total number of memory instructions executed by the address processor + over all compute units on the accelerator, per normalization unit. + rst: The total number of memory instructions executed by the address processor + over all compute units on the accelerator, per normalization unit. + unit: Instructions per normalization unit + Global/Generic Instructions: + plain: The total number of global & generic memory instructions executed on + all compute units on the accelerator, per normalization unit. + rst: The total number of global & generic memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Global/Generic Read Instructions: + plain: The total number of global & generic memory read instructions executed + on all compute units on the accelerator, per normalization unit. + rst: The total number of global & generic memory read instructions executed + on all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. 
+ unit: Instructions per normalization unit + Global/Generic Write Instructions: + plain: The total number of global & generic memory write instructions executed + on all compute units on the accelerator, per normalization unit. + rst: The total number of global & generic memory write instructions executed on + all :doc:`compute units ` on the accelerator, per :ref:`normalization + unit `. + unit: Instructions per normalization unit + Global/Generic Atomic Instructions: + plain: The total number of global & generic memory atomic (with and without + return) instructions executed on all compute units on the accelerator, per + normalization unit. + rst: The total number of global & generic memory atomic (with and without return) + instructions executed on all :doc:`compute units ` on the accelerator, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Instructions: + plain: The total number of spill/stack memory instructions executed on all compute + units on the accelerator, per normalization unit. + rst: The total number of spill/stack memory instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Read Instructions: + plain: The total number of spill/stack memory read instructions executed on + all compute units on the accelerator, per normalization unit. + rst: The total number of spill/stack memory read instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + unit: Instructions per normalization unit + Spill/Stack Write Instructions: + plain: The total number of spill/stack memory write instructions executed on + all compute units on the accelerator, per normalization unit. + rst: The total number of spill/stack memory write instructions executed on all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. 
+ unit: Instructions per normalization unit + Spill/Stack Atomic Instructions: + plain: The total number of spill/stack memory atomic (with and without return) + instructions executed on all compute units on the accelerator, per normalization + unit. Typically unused, as these memory operations are generally used to implement + thread-local storage. + rst: The total number of spill/stack memory atomic (with and without return) instructions + executed on all :doc:`compute units ` on the accelerator, per + :ref:`normalization unit `. Typically unused, as these + memory operations are generally used to implement thread-local storage. + unit: Instructions per normalization unit + Spill/Stack Total Cycles: + plain: The number of cycles the address processing unit spent working on spill/stack + instructions, per normalization unit. + rst: The number of cycles the address processing unit spent working on spill/stack + instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Read: + plain: The number of cycles the address processing unit spent working on coalesced + spill/stack read instructions, per normalization unit. + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack read instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Spill/Stack Coalesced Write: + plain: The number of cycles the address processing unit spent working on coalesced + spill/stack write instructions, per normalization unit. + rst: The number of cycles the address processing unit spent working on coalesced + spill/stack write instructions, per :ref:`normalization unit `. + unit: Cycles per normalization unit + Data-Return Busy: + plain: Percent of the total CU cycles the data-return unit was busy processing + or waiting on data to return to the CU. 
+ rst: Percent of the :ref:`total CU cycles ` the data-return unit + was busy processing or waiting on data to return to the :doc:`CU `. + unit: Percent + "Cache RAM \u2192 Data-Return Stall": + plain: Percent of the total CU cycles the data-return unit was stalled on data + to be returned from the vL1D Cache RAM. + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled on data to be returned from the :ref:`vL1D Cache RAM `. + unit: Percent + "Workgroup manager \u2192 Data-Return Stall": + plain: Percent of the total CU cycles the data-return unit was stalled by the + workgroup manager due to initialization of registers as a part of launching + new workgroups. + rst: Percent of the :ref:`total CU cycles ` the data-return unit + was stalled by the :ref:`workgroup manager ` due to initialization + of registers as a part of launching new workgroups. + unit: Percent + Coalescable Instructions: + plain: The number of instructions submitted to the data-return unit by the address + processor that were found to be coalescable, per normalization unit. + rst: The number of instructions submitted to the :ref:`data-return unit ` + by the :ref:`address processor ` that were found to be coalescable, + per :ref:`normalization unit `. + unit: Instructions per normalization unit + Read Instructions: + plain: The number of read instructions submitted to the data-return unit by + the address processor summed over all compute units on the accelerator, per + normalization unit. This is expected to be the sum of global/generic and spill/stack + reads in the address processor. + rst: The number of read instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack reads in + the :ref:`address processor `. 
+ unit: Instructions per normalization unit + Write Instructions: + plain: The number of store instructions submitted to the data-return unit by + the address processor summed over all compute units on the accelerator, per + normalization unit. This is expected to be the sum of global/generic and spill/stack + stores in the address processor. + rst: The number of store instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack stores counted + by the :ref:`vL1D cache-front-end `. + unit: Instructions per normalization unit + Atomic Instructions: + plain: The number of atomic instructions submitted to the data-return unit by + the address processor summed over all compute units on the accelerator, per + normalization unit. This is expected to be the sum of global/generic and spill/stack + atomics in the address processor. + rst: The number of atomic instructions submitted to the :ref:`data-return unit + ` by the :ref:`address processor ` summed over all :doc:`compute + units ` on the accelerator, per :ref:`normalization unit `. + This is expected to be the sum of global/generic and spill/stack atomics + in the :ref:`address processor `. 
+ unit: Instructions per normalization unit +- id: 1600 + title: Vector L1 Data Cache + data source: + - metric_table: + id: 1601 + title: vL1D Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + gfx90a: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + gfx941: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + gfx940: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 
+ 0) else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + gfx942: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + gfx950: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if 
(TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + gfx908: + Hit rate: + value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: Pct of Peak + Bandwidth: + value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp + - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) + unit: Pct of Peak + Utilization: + value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None)) + unit: Pct of Peak + Coalescing: + value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum + * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) + unit: Pct of Peak + comparable: false + cli_style: simple_bar + tui_style: simple_bar + - metric_table: + id: 1602 + title: vL1D cache stall metrics + header: + metric: Metric + expr: Expression + metric: + gfx90a: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + gfx941: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + 
expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + gfx940: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + gfx942: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + gfx950: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / 
TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on Address: + expr: (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if + (TCP_GATE_EN1_sum != 0) else None) + Stalled on Data: + expr: (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if + (TCP_GATE_EN1_sum != 0) else None) + Stalled on Latency FIFO: + expr: (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on Request FIFO: + expr: (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on Read Return: + expr: (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + gfx908: + Stalled on L2 Data: + expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Stalled on L2 Req: + expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum + != 0) else None) + Tag RAM Stall (Read): + expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Write): + expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + Tag RAM Stall (Atomic): + expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) + if (TCP_GATE_EN1_sum != 0) else None) + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1603 + title: vL1D cache access metrics + header: + metric: Metric + avg: Avg + min: Min + max: 
Max + unit: Unit + metric: + gfx90a: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / 
$denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + 
unit: (Req + $normUnit) + L1 Access Latency: + avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + unit: Cycles + L1-L2 Read Latency: + avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + unit: Cycles + L1-L2 Write Latency: + avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != + 0) else None)) + min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != + 0) else None)) + max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != + 0) else None)) + unit: Cycles + gfx941: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: 
AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) 
+ / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + gfx940: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + 
avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + gfx942: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + 
unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + gfx950: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: 
MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + 
TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Bytes + $normUnit) + Tag RAM 0 Req: + avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) + unit: (Req + $normUnit) + Tag RAM 1 Req: + avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) + unit: (Req + $normUnit) + Tag RAM 2 Req: + avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) + unit: (Req + $normUnit) + Tag RAM 3 Req: + avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: 
MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + L1 Access Latency: + avg: AVG((TCP_TCP_LATENCY_sum / $denom)) + min: MIN((TCP_TCP_LATENCY_sum / $denom)) + max: MAX((TCP_TCP_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + L1-L2 Read Latency: + avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + L1-L2 Write Latency: + avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + gfx908: + Total Req: + avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCP_TOTAL_READ_sum / $denom)) + min: MIN((TCP_TOTAL_READ_sum / $denom)) + max: MAX((TCP_TOTAL_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) + min: MIN((TCP_TOTAL_WRITE_sum / $denom)) + max: MAX((TCP_TOTAL_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) + / $denom)) + unit: (Req + $normUnit) + Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / 
$denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / $denom)) + unit: (Bytes + $normUnit) + Cache Hit Rate: + avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != + 0) else None)) + unit: pct + Cache Accesses: + avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) + unit: (Req + $normUnit) + Cache Hits: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + / $denom)) + unit: (Req + $normUnit) + Invalidations: + avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 BW: + avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / $denom)) + unit: (Bytes + $normUnit) + L1-L2 Read: + avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Write: + avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + L1-L2 Atomic: + avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) + / $denom)) + unit: (Req + $normUnit) + L1 Access Latency: + avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum + != 0) else None)) + unit: Cycles + L1-L2 Read Latency: + avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else + None)) + max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) + if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) 
else + None)) + unit: Cycles + L1-L2 Write Latency: + avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != + 0) else None)) + min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != + 0) else None)) + max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) + if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != + 0) else None)) + unit: Cycles + - metric_table: + id: 1604 + title: L1D - L2 Transactions + header: + metric: Metric + xfer: Xfer + coherency: Coherency + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: 
MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx941: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: 
MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + 
$normUnit) + gfx940: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: 
AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx942: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: 
MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx950: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: 
MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx908: + NC - Read: + xfer: Read + coherency: NC + avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) + unit: 
(Req + $normUnit) + UC - Read: + xfer: Read + coherency: UC + avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Read: + xfer: Read + coherency: CC + avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Read: + xfer: Read + coherency: RW + avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Write: + xfer: Write + coherency: RW + avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Write: + xfer: Write + coherency: NC + avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Write: + xfer: Write + coherency: UC + avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Write: + xfer: Write + coherency: CC + avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) + unit: (Req + $normUnit) + NC - Atomic: + xfer: Atomic + coherency: NC + avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC - Atomic: + xfer: Atomic + coherency: UC + avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC - Atomic: + xfer: Atomic + coherency: CC + avg: 
AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW - Atomic: + xfer: Atomic + coherency: RW + avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1605 + title: L1 Unified Translation Cache (UTCL1) + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + gfx90a: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + gfx941: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + 
if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + gfx940: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: 
MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + gfx942: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + gfx950: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Inflight Req: + avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) + min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) + max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / 
TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Misses under Translation Miss: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + gfx908: + Req: + avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) + min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) + max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) + units: (Req + $normUnit) + Hit Ratio: + avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) + if (TCP_UTCL1_REQUEST_sum != 0) else None)) + units: pct + Hits: + avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) + units: (Req + $normUnit) + Translation Misses: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) + units: (Req + $normUnit) + Permission Misses: + avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) 
+ min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) + units: (Req + $normUnit) + - metric_table: + id: 1606 + title: L1D Addr Translation Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + units: Units + metric: + gfx90a: {} + gfx941: {} + gfx940: {} + gfx942: {} + gfx950: + Cache Full Stall: + avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + units: (Cycles + $normUnit) + Cache Miss Stall: + avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + units: (Cycles + $normUnit) + Serialization Stall: + avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + units: (Cycles + $normUnit) + Thrashing Stall: + avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + units: (Cycles + $normUnit) + Latency FIFO Stall: + avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + units: (Cycles + $normUnit) + Resident Page Full Stall: + avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + units: (Cycles + $normUnit) + UTCL2 Stall: + avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + units: (Cycles + $normUnit) + gfx908: {} + metrics_description: + Hit rate: + plain: The ratio of the number of vL1D cache line requests that hit in 
vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in + vL1D cache over the total number of cache line requests to the :ref:`vL1D + Cache RAM `. + unit: Percent + Bandwidth: + plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions, + as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. The number of bytes is calculated as the number of cache lines + requested multiplied by the cache line size. This value does not consider + partial requests, so for instance, if only a single value is requested in + a cache line, the data movement will still be counted as a full cache line. + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions, as a percent of the peak theoretical bandwidth + achievable on the specific accelerator. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. + This value does not consider partial requests, so for instance, if only a + single value is requested in a cache line, the data movement will still be + counted as a full cache line. + unit: Percent + Utilization: + plain: Indicates how busy the vL1D Cache RAM was during the kernel execution. + The number of cycles where the vL1D Cache RAM is actively processing any request + divided by the number of cycles where the vL1D is active. + rst: Indicates how busy the :ref:`vL1D Cache RAM ` was during the kernel + execution. The number of cycles where the vL1D Cache RAM is actively processing + any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Coalescing: + plain: Indicates how well memory instructions were coalesced by the address + processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). 
+ Calculated as the average number of thread-requests generated per instruction + divided by the ideal number of thread-requests per instruction. + rst: Indicates how well memory instructions were coalesced by the :ref:`address + processing unit `, ranging from uncoalesced (25%) to fully coalesced + (100%). Calculated as the average number of :ref:`thread-requests ` + generated per instruction divided by the ideal number of thread-requests + per instruction. + unit: Percent + Stalled on L2 Data: + plain: The ratio of the number of cycles where the vL1D is stalled waiting for + requested data to return from the L2 cache divided by the number of cycles + where the vL1D is active. + rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested + data to return from the :doc:`L2 cache ` divided by the number + of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Stalled on L2 Req: + plain: The ratio of the number of cycles where the vL1D is stalled waiting to + issue a request for data to the L2 cache divided by the number of cycles where + the vL1D is active. + rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue + a request for data to the :doc:`L2 cache ` divided by the number + of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Read): + plain: The ratio of the number of cycles where the vL1D is stalled due to Read + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active. + rst: The ratio of the number of cycles where the vL1D is stalled due to Read + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. 
+ unit: Percent + Tag RAM Stall (Write): + plain: The ratio of the number of cycles where the vL1D is stalled due to Write + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active. + rst: The ratio of the number of cycles where the vL1D is stalled due to Write + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Tag RAM Stall (Atomic): + plain: The ratio of the number of cycles where the vL1D is stalled due to Atomic + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active. + rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic + requests with conflicting tags being looked up concurrently, divided by the + number of cycles where the vL1D is active [#vl1d-activity]_. + unit: Percent + Total Req: + plain: The total number of incoming requests from the address processing unit + after coalescing. + rst: The total number of incoming requests from the :ref:`address processing + unit ` after coalescing. + unit: Requests + Read Req: + plain: The total number of incoming read requests from the address processing + unit after coalescing per normalization unit. + rst: The total number of incoming read requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Write Req: + plain: The total number of incoming write requests from the address processing + unit after coalescing per normalization unit. + rst: The total number of incoming write requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Atomic Req: + plain: The total number of incoming atomic requests from the address processing + unit after coalescing per normalization unit. 
+ rst: The total number of incoming atomic requests from the :ref:`address processing + unit ` after coalescing per :ref:`normalization unit ` + unit: Requests per normalization unit + Cache BW: + plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions + per normalization unit. The number of bytes is calculated as the number of + cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so for instance, if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. + rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM + ` instructions per :ref:`normalization unit `. The + number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + for instance, if only a single value is requested in a cache line, the data movement + will still be counted as a full cache line. + unit: Bytes per normalization unit + Cache Hit Rate: + plain: The ratio of the number of vL1D cache line requests that hit in vL1D + cache over the total number of cache line requests to the vL1D Cache RAM. + rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache + over the total number of cache line requests to the :ref:`vL1D Cache RAM + `. + unit: Percent + Cache Accesses: + plain: The total number of cache line lookups in the vL1D. + rst: The total number of cache line lookups in the vL1D. + unit: Cache lines + Cache Hits: + plain: The number of cache accesses minus the number of outgoing requests to + the L2 cache, that is, the number of cache line requests serviced by the vL1D + Cache RAM per normalization unit. 
+ rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2 + cache `, that is, the number of cache line requests serviced by + the :ref:`vL1D Cache RAM ` per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Invalidations: + plain: The number of times the vL1D was issued a write-back invalidate command + during the kernel's execution per normalization unit. This may be triggered + by, for instance, the buffer_wbinvl1 instruction. + rst: The number of times the vL1D was issued a write-back invalidate command during + the kernel's execution per :ref:`normalization unit `. This + may be triggered by, for instance, the ``buffer_wbinvl1`` instruction. + unit: Invalidations per normalization unit + L1-L2 BW: + plain: The number of bytes transferred across the vL1D-L2 interface as a result + of VMEM instructions, per normalization unit. The number of bytes is calculated + as the number of cache lines requested multiplied by the cache line size. + This value does not consider partial requests, so for instance, if only a + single value is requested in a cache line, the data movement will still be + counted as a full cache line. + rst: The number of bytes transferred across the vL1D-L2 interface as a result of + :ref:`VMEM ` instructions, per :ref:`normalization unit `. + The number of bytes is calculated as the number of cache lines requested + multiplied by the cache line size. This value does not consider partial requests, + so for instance, if only a single value is requested in a cache line, the + data movement will still be counted as a full cache line. + unit: Bytes per normalization unit + L1-L2 Read: + plain: The number of read requests for a vL1D cache line that were not satisfied + by the vL1D and must be retrieved from the L2 Cache per normalization + unit.
+ rst: The number of read requests for a vL1D cache line that were not satisfied by + the vL1D and must be retrieved from the :doc:`L2 Cache ` + per :ref:`normalization unit `. + unit: Requests per normalization unit + L1-L2 Write: + plain: The number of write requests to a vL1D cache line that were sent through + the vL1D to the L2 cache, per normalization unit. + rst: The number of write requests to a vL1D cache line that were sent through the + vL1D to the :doc:`L2 cache `, per :ref:`normalization unit `. + unit: Requests per normalization unit + L1-L2 Atomic: + plain: The number of atomic requests that are sent through the vL1D to the L2 + cache, per normalization unit. This includes requests for atomics with, and + without return. + rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 + cache `, per :ref:`normalization unit `. This + includes requests for atomics with, and without return. + unit: Requests per normalization unit + L1 Access Latency: + plain: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + rst: Calculated as the average number of cycles that a vL1D cache line request + spent in the vL1D cache pipeline. + unit: Cycles + L1-L2 Read Latency: + plain: Calculated as the average number of cycles that the vL1D cache took to + issue and receive read requests from the L2 Cache. This number also includes + requests for atomics with return values. + rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive read requests from the :doc:`L2 Cache `. This number + also includes requests for atomics with return values. + unit: Cycles + L1-L2 Write Latency: + plain: Calculated as the average number of cycles that the vL1D cache took to + issue and receive acknowledgement of a write request to the L2 Cache. This + number also includes requests for atomics without return values.
+ rst: Calculated as the average number of cycles that the vL1D cache took to issue + and receive acknowledgement of a write request to the :doc:`L2 Cache `. + This number also includes requests for atomics without return values. + unit: Cycles + NC - Read: + plain: Total read requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Read: + plain: Total read requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Read: + plain: Total read requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total read requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + RW - Read: + plain: Total read requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total read requests with RW mtype from this TCP to all TCCs Sum over TCP instances per normalization unit. + unit: Requests per normalization unit + RW - Write: + plain: Total write requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total write requests with RW mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Write: + plain: Total write requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + UC - Write: + plain: Total write requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit.
+ rst: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + CC - Write: + plain: Total write requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP + instances per normalization unit. + unit: Requests per normalization unit + NC - Atomic: + plain: Total atomic requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total atomic requests with NC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + unit: Requests per normalization unit + UC - Atomic: + plain: Total atomic requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total atomic requests with UC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + unit: Requests per normalization unit + CC - Atomic: + plain: Total atomic requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total atomic requests with CC mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + unit: Requests per normalization unit + RW - Atomic: + plain: Total atomic requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + rst: Total atomic requests with RW mtype from this TCP to all TCCs Sum over + TCP instances per normalization unit. + unit: Requests per normalization unit + Req: + plain: The number of translation requests made to the UTCL1 per normalization + unit. + rst: The number of translation requests made to the UTCL1 per normalization + unit. 
+ unit: Requests per normalization unit + Hit Ratio: + plain: The ratio of the number of translation requests that hit in the UTCL1 + divided by the total number of translation requests made to the UTCL1. + rst: The ratio of the number of translation requests that hit in the UTCL1 divided + by the total number of translation requests made to the UTCL1. + unit: Percent + Hits: + plain: The number of translation requests that hit in the UTCL1, and could be + reused, per normalization unit. + rst: The number of translation requests that hit in the UTCL1, and could be + reused, per normalization unit. + unit: Requests per normalization unit + Translation Misses: + plain: The total number of translation requests that missed in the UTCL1 due + to translation not being present in the cache, per normalization unit. + rst: The total number of translation requests that missed in the UTCL1 due to translation + not being present in the cache, per :ref:`normalization unit <normalization-units>`. + unit: Requests per normalization unit + Permission Misses: + plain: "The total number of translation requests that missed in the UTCL1 due\ + \ to a permission error, per normalization unit. This is unused and expected\ + \ to be zero in most configurations for modern CDNA\u2122 accelerators." + rst: "The total number of translation requests that missed in the UTCL1 due\ + \ to a permission error, per :ref:`normalization unit <normalization-units>`.\ + \ This is unused and expected to be zero in most configurations for modern\ + \ CDNA\u2122 accelerators."
+ unit: Requests per normalization unit +- id: 1700 + title: L2 Cache + data source: + - metric_table: + id: 1701 + title: L2 Speed-of-Light + header: + metric: Metric + value: Avg + unit: Unit + metric: + gfx90a: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + gfx941: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + gfx940: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * 
$GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + gfx942: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp + - Start_Timestamp)) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + gfx950: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / 
(TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + gfx908: + Utilization: + value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct + Peak Bandwidth: + value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) + / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) + unit: pct + Hit Rate: + value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else 0)) + unit: pct + L2-Fabric Read BW: + value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + L2-Fabric Write and Atomic BW: + value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / (End_Timestamp - Start_Timestamp))) + unit: GB/s + HBM Bandwidth: + value: $hbmBandwidth + unit: GB/s + - metric_table: + id: 1702 + title: L2-Fabric interface metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Read BW: + avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: 
MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * 
((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + unit: Cycles + gfx941: + Read BW: + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - 
TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / 
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if 
(TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + gfx940: + Read BW: + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) + / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) + / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) + / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - 
TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + 
!= 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + gfx942: + Read BW: + avg: AVG(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum + - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / + TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached 
Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if 
(TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + gfx950: + Read BW: + avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) + min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) + max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + + (TCC_EA0_RDREQ_128B_sum * 128)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if 
(TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) + / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) + / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) + / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / + TCC_EA0_WRREQ_sum) if 
(TCC_EA0_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: pct + Read Latency: + avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum + != 0) else None)) + unit: Cycles + Read Stall: + avg: AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + min: MIN((((100 * 
((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + max: MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + Write Stall: + avg: AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum + != 0) else None)) + unit: pct + gfx908: + Read BW: + avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) + * 64)) / $denom)) + unit: (Bytes + $normUnit) + HBM Read Traffic: + avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Remote Read Traffic: + avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) + if (TCC_EA_RDREQ_sum != 0) else None)) + unit: pct + Uncached Read Traffic: + avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) 
if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: pct + Write and Atomic BW: + avg: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + min: MIN((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + max: MAX((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) + * 32)) / $denom)) + unit: (Bytes + $normUnit) + HBM Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Remote Write and Atomic Traffic: + avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) + if (TCC_EA_WRREQ_sum != 0) else None)) + unit: pct + Atomic Traffic: + avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Uncached Write and Atomic Traffic: + avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: pct + Read 
Latency: + avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum + != 0) else None)) + unit: Cycles + Write and Atomic Latency: + avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum + != 0) else None)) + unit: Cycles + Atomic Latency: + avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum + != 0) else None)) + unit: Cycles + - metric_table: + id: 1703 + title: L2 Cache Accesses + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: 
MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + 
$normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx941: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / 
$denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx940: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: 
AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: 
AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx942: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / 
$denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / 
$denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx950: + Bandwidth: + avg: AVG((TCC_REQ_sum * 128) / $denom) + min: MIN((TCC_REQ_sum * 128) / $denom) + max: MAX((TCC_REQ_sum * 128) / $denom) + unit: (Bytes + $normUnit) + Read Bandwidth: + avg: AVG(TCC_READ_SECTORS_sum * 32/ $denom) + min: MIN(TCC_READ_SECTORS_sum * 32/ $denom) + max: MAX(TCC_READ_SECTORS_sum * 32/ $denom) + unit: (Bytes + $normUnit) + Write Bandwidth: + avg: AVG(TCC_WRITE_SECTORS_sum * 32/ $denom) + min: MIN(TCC_WRITE_SECTORS_sum * 32/ $denom) + max: MAX(TCC_WRITE_SECTORS_sum * 32/ $denom) + unit: (Bytes + $normUnit) + Atomic Bandwidth: + avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ $denom) + min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ $denom) + max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / 
$denom)) + unit: (Req + $normUnit) + Bypasss Req: + avg: AVG((TCC_BYPASS_REQ_sum / $denom)) + min: MIN((TCC_BYPASS_REQ_sum / $denom)) + max: MAX((TCC_BYPASS_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Input Buffer Req: + avg: AVG((TCC_IB_REQ_sum / $denom)) + min: MIN((TCC_IB_REQ_sum / $denom)) + max: MAX((TCC_IB_REQ_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: 
AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum / $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + gfx908: + Bandwidth: + avg: AVG((TCC_REQ_sum * 64) / $denom) + min: MIN((TCC_REQ_sum * 64) / $denom) + max: MAX((TCC_REQ_sum * 64) / $denom) + unit: (Bytes + $normUnit) + Req: + avg: AVG((TCC_REQ_sum / $denom)) + min: MIN((TCC_REQ_sum / $denom)) + max: MAX((TCC_REQ_sum / $denom)) + unit: (Req + $normUnit) + Read Req: + avg: AVG((TCC_READ_sum / $denom)) + min: MIN((TCC_READ_sum / $denom)) + max: MAX((TCC_READ_sum / $denom)) + unit: (Req + $normUnit) + Write Req: + avg: AVG((TCC_WRITE_sum / $denom)) + min: MIN((TCC_WRITE_sum / $denom)) + max: MAX((TCC_WRITE_sum / $denom)) + unit: (Req + $normUnit) + Atomic Req: + avg: AVG((TCC_ATOMIC_sum / $denom)) + min: MIN((TCC_ATOMIC_sum / $denom)) + max: MAX((TCC_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Streaming Req: + avg: AVG((TCC_STREAMING_REQ_sum / $denom)) + min: MIN((TCC_STREAMING_REQ_sum / $denom)) + max: MAX((TCC_STREAMING_REQ_sum / $denom)) + unit: (Req + $normUnit) + Probe Req: + avg: AVG((TCC_PROBE_sum / $denom)) + min: MIN((TCC_PROBE_sum / $denom)) + max: MAX((TCC_PROBE_sum / $denom)) + unit: (Req + $normUnit) + Cache Hit: + avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + 
min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + + TCC_MISS_sum) != 0) else None)) + unit: pct + Hits: + avg: AVG((TCC_HIT_sum / $denom)) + min: MIN((TCC_HIT_sum / $denom)) + max: MAX((TCC_HIT_sum / $denom)) + unit: (Hits + $normUnit) + Misses: + avg: AVG((TCC_MISS_sum / $denom)) + min: MIN((TCC_MISS_sum / $denom)) + max: MAX((TCC_MISS_sum / $denom)) + unit: (Misses + $normUnit) + Writeback: + avg: AVG((TCC_WRITEBACK_sum / $denom)) + min: MIN((TCC_WRITEBACK_sum / $denom)) + max: MAX((TCC_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (Internal): + avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) + min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) + max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Writeback (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (Internal): + avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) + min: MIN((TCC_NORMAL_EVICT_sum / $denom)) + max: MAX((TCC_NORMAL_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + Evict (vL1D Req): + avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) + unit: (Cachelines + $normUnit) + NC Req: + avg: AVG((TCC_NC_REQ_sum / $denom)) + min: MIN((TCC_NC_REQ_sum / $denom)) + max: MAX((TCC_NC_REQ_sum / $denom)) + unit: (Req + $normUnit) + UC Req: + avg: AVG((TCC_UC_REQ_sum / $denom)) + min: MIN((TCC_UC_REQ_sum / $denom)) + max: MAX((TCC_UC_REQ_sum / $denom)) + unit: (Req + $normUnit) + CC Req: + avg: AVG((TCC_CC_REQ_sum / $denom)) + min: MIN((TCC_CC_REQ_sum / $denom)) + max: MAX((TCC_CC_REQ_sum / $denom)) + unit: (Req + $normUnit) + RW Req: + avg: AVG((TCC_RW_REQ_sum 
/ $denom)) + min: MIN((TCC_RW_REQ_sum / $denom)) + max: MAX((TCC_RW_REQ_sum / $denom)) + unit: (Req + $normUnit) + - metric_table: + id: 1704 + title: L2 Cache Stalls + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: {} + gfx941: {} + gfx940: {} + gfx942: {} + gfx950: + Stalled on Latency FIFO: + avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom) + min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom) + max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom) + unit: (Cycles + $normUnit) + Stalled on Write Data FIFO: + avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom) + min: MIN(TCC_SRC_FIFO_FULL_sum / $denom) + max: MAX(TCC_SRC_FIFO_FULL_sum / $denom) + unit: (Cycles + $normUnit) + Input Buffer Stalled on L2: + avg: AVG(TCC_IB_STALL_sum / $denom) + min: MIN(TCC_IB_STALL_sum / $denom) + max: MAX(TCC_IB_STALL_sum / $denom) + unit: (Cycles + $normUnit) + gfx908: {} + - metric_table: + id: 1705 + title: L2 - Fabric Interface stalls + header: + metric: Metric + type: Type + transaction: Transaction + avg: Avg + min: Min + max: Max + unit: Unit + style: + type: simple_multi_bar + metric: + gfx90a: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + gfx941: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + gfx940: + Write - Credit Starvation: + type: Credit 
Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + gfx942: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + gfx950: + Read - PCIe Stall: + type: PCIe Stall + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + Read - Infinity Fabric Stall: + type: "Infinity Fabric\u2122 Stall" + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + unit: pct + Read - HBM Stall: + type: HBM Stall + transaction: Read + avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else 
None)) + unit: pct + Write - PCIe Stall: + type: PCIe Stall + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + Write - Infinity Fabric Stall: + type: "Infinity Fabric\u2122 Stall" + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + unit: pct + Write - HBM Stall: + type: HBM Stall + transaction: Write + avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) + if (TCC_BUSY_sum != 0) else None)) + unit: pct + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + gfx908: + Write - Credit Starvation: + type: Credit Starvation + transaction: Write + avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + max: MAX(((100 * 
(TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if + (TCC_BUSY_sum != 0) else None)) + unit: pct + - metric_table: + id: 1706 + title: L2 - Fabric interface detailed metrics + header: + metric: Metric + avg: Avg + min: Min + max: Max + unit: Unit + metric: + gfx90a: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: 
AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + gfx941: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: 
MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + gfx940: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + 
$normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + gfx942: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + / $denom)) + min: MIN(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + / $denom)) + max: MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + / $denom)) + unit: (Req + $normUnit) + Read (128B): + avg: AVG(((TCC_BUBBLE_sum) / $denom)) + min: MIN(((TCC_BUBBLE_sum) / $denom)) + max: MAX(((TCC_BUBBLE_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / 
$denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + gfx950: + Read (32B): + avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) + max: 
MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + Read (128B): + avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA0_WRREQ_sum - 
TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write Bandwidth - PCIe: + avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum / $denom) + unit: (Bytes + $normUnit) + "Write Bandwidth - Infinity Fabric\u2122": + avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum / $denom) + unit: (Bytes + $normUnit) + Write Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum / $denom) + unit: (Bytes + $normUnit) + Atomic: + avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) + max: MAX((TCC_EA0_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + Atomic - HBM: + avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Atomic Bandwidth - PCIe: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum / $denom) + unit: (Bytes + $normUnit) + "Atomic Bandwidth - Infinity Fabric\u2122": + avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum / $denom) + unit: (Bytes + $normUnit) + Atomic Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) + min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) + max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum / $denom) + unit: (Bytes + $normUnit) + gfx908: + Read (32B): + avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) + min: 
MIN((TCC_EA_RDREQ_32B_sum / $denom)) + max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) + unit: (Req + $normUnit) + Read (64B): + avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) + unit: (Req + $normUnit) + Read (Uncached): + avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + HBM Read: + avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Read: + avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (32B): + avg: AVG(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + min: MIN(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + max: MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom)) + unit: (Req + $normUnit) + Write and Atomic (Uncached): + avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) + unit: (Req + $normUnit) + Write and Atomic (64B): + avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) + max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) + unit: (Req + $normUnit) + HBM Write and Atomic: + avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) + unit: (Req + $normUnit) + Remote Write and Atomic: + avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + max: 
MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) + unit: (Req + $normUnit) + Atomic: + avg: AVG((TCC_EA_ATOMIC_sum / $denom)) + min: MIN((TCC_EA_ATOMIC_sum / $denom)) + max: MAX((TCC_EA_ATOMIC_sum / $denom)) + unit: (Req + $normUnit) + metrics_description: + Utilization: + plain: The ratio of the number of cycles an L2 channel was active, summed over + all L2 channels on the accelerator over the total L2 cycles. + rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed + over all L2 channels on the accelerator ` over the + :ref:`total L2 cycles `. + unit: Percent + Peak Bandwidth: + plain: The number of bytes looked up in the L2 cache, as a percent of the peak + theoretical bandwidth achievable on the specific accelerator. The number of + bytes is calculated as the number of cache lines requested multiplied by the + cache line size. This value does not consider partial requests, so e.g., if + only a single value is requested in a cache line, the data movement will still + be counted as a full cache line. + rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. The number of bytes is + calculated as the number of cache lines requested multiplied by the cache + line size. This value does not consider partial requests, so e.g., if only + a single value is requested in a cache line, the data movement will still + be counted as a full cache line. + unit: Percent + Hit Rate: + plain: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + L2-Fabric Read BW: + plain: The number of bytes read by the L2 over the Infinity Fabric interface + per unit time. 
+ rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface + ` per unit time. + unit: GB/s + L2-Fabric Write and Atomic BW: + plain: The number of bytes sent by the L2 over the Infinity Fabric interface + by write and atomic operations per unit time. + rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface + ` by write and atomic operations per unit time. + unit: GB/s + HBM Bandwidth: + plain: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. This value is calculated as the number of HBM + channels multiplied by the HBM channel width multiplied by the HBM clock frequency. + rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth + memory (HBM) per unit time. This value is calculated as the number of HBM + channels multiplied by the HBM channel width multiplied by the HBM clock frequency. + unit: GB/s + Read BW: + plain: The total number of bytes read by the L2 cache from Infinity Fabric per + normalization unit. + rst: The total number of bytes read by the L2 cache from Infinity Fabric per :ref:`normalization + unit `. + unit: Bytes per normalization unit + HBM Read Traffic: + plain: The percent of read requests generated by the L2 cache that are routed + to the accelerator's local high-bandwidth memory (HBM). This breakdown does + not consider the size of the request (meaning that 32B and 64B requests are + both counted as a single request), so this metric only approximates the percent + of the L2-Fabric Read bandwidth directed to the local HBM. + rst: The percent of read requests generated by the L2 cache that are routed + to the accelerator's local high-bandwidth memory (HBM). This breakdown does not + consider the *size* of the request (meaning that 32B and 64B requests are + both counted as a single request), so this metric only *approximates* the + percent of the L2-Fabric Read bandwidth directed to the local HBM. 
+ unit: Percent + Remote Read Traffic: + plain: The percent of read requests generated by the L2 cache that are routed + to any memory location other than the accelerator's local high-bandwidth memory + (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Read bandwidth directed to a remote location. + rst: The percent of read requests generated by the L2 cache that are routed + to any memory location other than the accelerator's local high-bandwidth memory + (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the *size* of the request (meaning that 32B and + 64B requests are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric Read bandwidth directed to a remote location. + unit: Percent + Uncached Read Traffic: + plain: The percent of read requests generated by the L2 cache that are reading + from an uncached memory allocation. Note, as described in the request flow + section, a single 64B read request is typically counted as two uncached read + requests. So, it is possible for the Uncached Read Traffic to reach up to + 200% of the total number of read requests. This breakdown does not consider + the size of the request (i.e., 32B and 64B requests are both counted as a + single request), so this metric only approximates the percent of the L2-Fabric + read bandwidth directed to an uncached memory location. + rst: The percent of read requests generated by the L2 cache that are reading from + an :ref:`uncached memory allocation `. Note, as described in + the :ref:`request flow ` section, a single 64B read request + is typically counted as two uncached read requests. 
So, it is possible for + the Uncached Read Traffic to reach up to 200% of the total number of read + requests. This breakdown does not consider the *size* of the request (i.e., + 32B and 64B requests are both counted as a single request), so this metric + only *approximates* the percent of the L2-Fabric read bandwidth directed + to an uncached memory location. + unit: Percent + Write and Atomic BW: + plain: The total number of bytes written by the L2 over Infinity Fabric by write + and atomic operations per normalization unit. Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric + if they are targeted at non-write-cacheable memory, for example, fine-grained + memory allocations or uncached memory allocations on the MI2XX. + rst: The total number of bytes written by the L2 over Infinity Fabric by write and + atomic operations per :ref:`normalization unit `. Note + that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at non-write-cacheable memory, for example, :ref:`fine-grained memory ` + allocations or :ref:`uncached memory ` allocations on the MI2XX. + unit: Bytes per normalization unit + HBM Write and Atomic Traffic: + plain: The percent of write and atomic requests generated by the L2 cache that + are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to the local + HBM. Note that on current CDNA accelerators, such as the MI2XX, requests are + only considered atomic by Infinity Fabric if they are targeted at fine-grained + memory allocations or uncached memory allocations. 
+ rst: The percent of write and atomic requests generated by the L2 cache that are + routed to the accelerator's local high-bandwidth memory (HBM). This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric Write and Atomic bandwidth directed to the local + HBM. Note that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations. + unit: Percent + Remote Write and Atomic Traffic: + plain: The percent of write and atomic requests generated by the L2 cache that are routed + to any memory location other than the accelerator's local high-bandwidth memory + (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown + does not consider the size of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only approximates the + percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note + that on current CDNA accelerators, such as the MI2XX, requests are only considered + atomic by Infinity Fabric if they are targeted at fine-grained memory allocations + or uncached memory allocations. + rst: The percent of write and atomic requests generated by the L2 cache that are routed + to any memory location other than the accelerator's local high-bandwidth memory + (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This + breakdown does not consider the *size* of the request (meaning that 32B and + 64B requests are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location.
+ Note that on current CDNA accelerators, such as the :ref:`MI2XX `, + requests are only considered *atomic* by Infinity Fabric if they are targeted + at :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations. + unit: Percent + Atomic Traffic: + plain: The percent of write requests generated by the L2 cache that are atomic + requests to any memory location. This breakdown does not consider the size + of the request (meaning that 32B and 64B requests are both counted as a single + request), so this metric only approximates the percent of the L2-Fabric Write + and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators, + such as the MI2XX, requests are only considered atomic by Infinity Fabric + if they are targeted at fine-grained memory allocations or uncached memory + allocations. + rst: The percent of write requests generated by the L2 cache that are atomic requests + to *any* memory location. This breakdown does not consider the *size* of + the request (meaning that 32B and 64B requests are both counted as a single + request), so this metric only *approximates* the percent of the L2-Fabric + Write and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators, + such as the :ref:`MI2XX `, requests are only considered *atomic* + by Infinity Fabric if they are targeted at :ref:`fine-grained memory ` + allocations or :ref:`uncached memory ` allocations. + unit: Percent + Uncached Write and Atomic Traffic: + plain: The percent of write and atomic requests generated by the L2 cache that + are targeting uncached memory allocations. This breakdown does not consider + the size of the request (meaning that 32B and 64B requests are both counted + as a single request), so this metric only approximates the percent of the + L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+ rst: The percent of write and atomic requests generated by the L2 cache that are + targeting :ref:`uncached memory allocations `. This breakdown + does not consider the *size* of the request (meaning that 32B and 64B requests + are both counted as a single request), so this metric only *approximates* + the percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. + unit: Percent + Read Latency: + plain: The time-averaged number of cycles read requests spent in Infinity Fabric + before data was returned to the L2. + rst: The time-averaged number of cycles read requests spent in Infinity Fabric before + data was returned to the L2. + unit: Cycles + Write and Atomic Latency: + plain: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + rst: The time-averaged number of cycles write requests spent in Infinity Fabric + before a completion acknowledgement was returned to the L2. + unit: Cycles + Atomic Latency: + plain: The time-averaged number of cycles atomic requests spent in Infinity + Fabric before a completion acknowledgement (atomic without return value) or + data (atomic with return value) was returned to the L2. + rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric + before a completion acknowledgement (atomic without return value) or data + (atomic with return value) was returned to the L2. + unit: Cycles + Bandwidth: + plain: The number of bytes looked up in the L2 cache, per normalization unit. + The number of bytes is calculated as the number of cache lines requested multiplied + by the cache line size. This value does not consider partial requests, so + for example, if only a single value is requested in a cache line, the data + movement will still be counted as a full cache line. + rst: The number of bytes looked up in the L2 cache, per :ref:`normalization + unit `.
The number of bytes is calculated as the number + of cache lines requested multiplied by the cache line size. This value does + not consider partial requests, so for example, if only a single value is + requested in a cache line, the data movement will still be counted as a full + cache line. + unit: Bytes per normalization unit + Req: + plain: The total number of incoming requests to the L2 from all clients for + all request types, per normalization unit. + rst: The total number of incoming requests to the L2 from all clients for all request + types, per :ref:`normalization unit `. + unit: Requests per normalization unit + Read Req: + plain: The total number of read requests to the L2 from all clients. + rst: 'The total number of read requests to the L2 from all clients. ' + unit: Requests per normalization unit + Write Req: + plain: The total number of write requests to the L2 from all clients. + rst: The total number of write requests to the L2 from all clients. + unit: Requests per normalization unit + Atomic Req: + plain: The total number of atomic requests (with and without return) to the + L2 from all clients. + rst: The total number of atomic requests (with and without return) to the L2 + from all clients. + unit: Requests per normalization unit + Streaming Req: + plain: The total number of incoming requests to the L2 that are marked as streaming. + The exact meaning of this may differ depending on the targeted accelerator, + however on an MI2XX this corresponds to non-temporal load or stores. The L2 + cache attempts to evict streaming requests before normal requests when the + L2 is at capacity. + rst: The total number of incoming requests to the L2 that are marked as *streaming*. + The exact meaning of this may differ depending on the targeted accelerator, + however on an :ref:`MI2XX ` this corresponds to `non-temporal + load or stores `_. The + L2 cache attempts to evict *streaming* requests before normal requests when + the L2 is at capacity. 
+ unit: Requests per normalization unit + Probe Req: + plain: The number of coherence probe requests made to the L2 cache from outside + the accelerator. On an MI2XX, probe requests may be generated by, for example, + writes to fine-grained device memory or by writes to coarse-grained device + memory. + rst: The number of coherence probe requests made to the L2 cache from outside the + accelerator. On an :ref:`MI2XX `, probe requests may be generated + by, for example, writes to :ref:`fine-grained device ` memory + or by writes to :ref:`coarse-grained ` device memory. + unit: Requests per normalization unit + Cache Hit: + plain: The ratio of the number of L2 cache line requests that hit in the L2 + cache over the total number of incoming cache line requests to the L2 cache. + rst: The ratio of the number of L2 cache line requests that hit in the L2 cache + over the total number of incoming cache line requests to the L2 cache. + unit: Percent + Hits: + plain: The total number of requests to the L2 from all clients that hit in the + cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. + rst: The total number of requests to the L2 from all clients that hit in the cache. + As noted in the :ref:`Speed-of-Light ` section, this includes hit-on-miss + requests. + unit: Requests per normalization unit + Misses: + plain: The total number of requests to the L2 from all clients that miss in + the cache. As noted in the Speed-of-Light section, these do not include hit-on-miss + requests. + rst: The total number of requests to the L2 from all clients that miss in the cache. + As noted in the :ref:`Speed-of-Light ` section, these do not include + hit-on-miss requests. + unit: Requests per normalization unit + Writeback: + plain: The total number of L2 cache lines written back to memory for any reason. 
+ Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system + or atomic built-ins), by the command processor's memory acquire/release fences, + or for other internal hardware reasons. + rst: The total number of L2 cache lines written back to memory for any reason. Write-backs + may occur due to user code (such as HIP kernel calls to ``__threadfence_system`` + or atomic built-ins), by the :doc:`command processor `'s + memory acquire/release fences, or for other internal hardware reasons. + unit: Cache lines per normalization unit + Writeback (Internal): + plain: The total number of L2 cache lines written back to memory for internal + hardware reasons, per normalization unit. + rst: The total number of L2 cache lines written back to memory for internal hardware + reasons, per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Writeback (vL1D Req): + plain: The total number of L2 cache lines written back to memory due to requests + initiated by the vL1D cache, per normalization unit. + rst: The total number of L2 cache lines written back to memory due to requests initiated + by the :doc:`vL1D cache `, per :ref:`normalization unit + `. + unit: Cache lines per normalization unit + Evict (Internal): + plain: The total number of L2 cache lines evicted from the cache due to capacity + limits, per normalization unit. + rst: The total number of L2 cache lines evicted from the cache due to capacity limits, + per :ref:`normalization unit `. + unit: Cache lines per normalization unit + Evict (vL1D Req): + plain: The total number of L2 cache lines evicted from the cache due to invalidation + requests initiated by the vL1D cache, per normalization unit. + rst: The total number of L2 cache lines evicted from the cache due to invalidation + requests initiated by the :doc:`vL1D cache `, per :ref:`normalization + unit `.
+ unit: Cache lines per normalization unit + NC Req: + plain: The total number of requests to the L2 to Not-hardware-Coherent (NC) + memory allocations, per normalization unit. + rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory + allocations, per :ref:`normalization unit `. See the + :ref:`memory-type` for more information. + unit: Requests per normalization unit + UC Req: + plain: The total number of requests to the L2 that go to Uncached (UC) memory + allocations. + rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations. + See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + CC Req: + plain: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. + rst: The total number of requests to the L2 that go to Coherently Cacheable + (CC) memory allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + RW Req: + plain: The total number of requests to the L2 that go to Read-Write coherent + memory (RW) allocations. + rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW) + allocations. See the :ref:`memory-type` for more information. + unit: Requests per normalization unit + Write - Credit Starvation: + plain: The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to any memory location because too many write/atomic requests + were currently in flight, as a percent of the total active L2 cycles. + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to any memory location because too many write/atomic requests were + currently in flight, as a percent of the :ref:`total active L2 cycles `. + unit: Percent + Read (32B): + plain: The total number of L2 requests to Infinity Fabric to read 32B of data + from any memory location, per normalization unit. 
+ rst: The total number of L2 requests to Infinity Fabric to read 32B of data from + any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators. + unit: Requests per normalization unit + Read (64B): + plain: The total number of L2 requests to Infinity Fabric to read 64B of data + from any memory location, per normalization unit. + rst: The total number of L2 requests to Infinity Fabric to read 64B of data from + any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Read (Uncached): + plain: The total number of L2 requests to Infinity Fabric to read uncached data + from any memory location, per normalization unit. 64B requests for uncached + data are counted as two 32B uncached data requests. + rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached + data ` from any memory location, per :ref:`normalization unit + `. 64B requests for uncached data are counted as two + 32B uncached data requests. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Read: + plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from the accelerator's local HBM, per normalization unit. + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from the accelerator's local HBM, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Read: + plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B + of data from any source other than the accelerator's local HBM, per normalization + unit. + rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data + from any source other than the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. 
+ unit: Requests per normalization unit + Write and Atomic (32B): + plain: The total number of L2 requests to Infinity Fabric to write or atomically + update 32B of data to any memory location, per normalization unit. + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B of data to any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (Uncached): + plain: The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of uncached data, per normalization unit. + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of :ref:`uncached data `, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Write and Atomic (64B): + plain: The total number of L2 requests to Infinity Fabric to write or atomically + update 64B of data in any memory location, per normalization unit. + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + HBM Write and Atomic: + plain: The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in the accelerator's local HBM, per normalization + unit. + rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization + unit `. See :ref:`l2-request-flow` for more detail. + unit: Requests per normalization unit + Remote Write and Atomic: + plain: The total number of L2 requests to Infinity Fabric to write or atomically + update 32B or 64B of data in any memory location other than the accelerator's + local HBM, per normalization unit.
+ rst: The total number of L2 requests to Infinity Fabric to write or atomically update + 32B or 64B of data in any memory location other than the accelerator's local + HBM, per :ref:`normalization unit `. See :ref:`l2-request-flow` + for more detail. + unit: Requests per normalization unit + Atomic: + plain: The total number of L2 requests to Infinity Fabric to atomically update + 32B or 64B of data in any memory location, per normalization unit. See Request + flow for more detail. Note that on current CDNA accelerators, such as the + MI2XX, requests are only considered atomic by Infinity Fabric if they are + targeted at non-write-cacheable memory, such as fine-grained memory allocations + or uncached memory allocations on the MI2XX. + rst: The total number of L2 requests to Infinity Fabric to atomically update 32B + or 64B of data in any memory location, per :ref:`normalization unit `. + See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators, + such as the :ref:`MI2XX `, requests are only considered *atomic* + by Infinity Fabric if they are targeted at non-write-cacheable memory, such + as :ref:`fine-grained memory ` allocations or :ref:`uncached + memory ` allocations on the MI2XX. + unit: Requests per normalization unit + Read Stall: + plain: "The ratio of the total number of cycles the L2-Fabric interface was\ + \ stalled on a read request to any destination (local HBM, remote PCIe\xAE\ + \ connected accelerator or CPU, or remote Infinity Fabric connected accelerator\ + \ or CPU) over the total active L2 cycles." + rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\ + \ on a read request to any destination (local HBM, remote PCIe\xAE connected\ + \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\ + \ or CPU) over the :ref:`total active L2 cycles `." 
+ unit: Percent + Write Stall: + plain: The ratio of the total number of cycles the L2-Fabric interface was stalled + on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator or CPU) over the total active L2 cycles. + rst: The ratio of the total number of cycles the L2-Fabric interface was stalled + on a write or atomic request to any destination (local HBM, remote accelerator + or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected + accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles `. + unit: Percent + Read - PCIe Stall: + plain: The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators or CPUs as a percent of the total active + L2 cycles. + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the + :ref:`total active L2 cycles `. + unit: Percent + Read - Infinity Fabric Stall: + plain: The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators or CPUs as a percent of the + total active L2 cycles. + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Read - HBM Stall: + plain: The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the total active L2 cycles. + rst: The number of cycles the L2-Fabric interface was stalled on read requests + to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles + `. 
+ unit: Percent + Write - PCIe Stall: + plain: The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to remote PCIe connected accelerators or CPUs as a percent + of the total active L2 cycles. + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent + of the :ref:`total active L2 cycles `. + unit: Percent + Write - Infinity Fabric Stall: + plain: The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to remote Infinity Fabric connected accelerators or CPUs as + a percent of the total active L2 cycles. + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs + as a percent of the :ref:`total active L2 cycles `. + unit: Percent + Write - HBM Stall: + plain: The number of cycles the L2-Fabric interface was stalled on write or + atomic requests to the accelerator's local HBM as a percent of the total active + L2 cycles. + rst: The number of cycles the L2-Fabric interface was stalled on write or atomic + requests to the accelerator's local HBM as a percent of the total active L2 cycles.
+ unit: Percent +- id: 1800 + title: L2 Cache (per Channel) + data source: + - metric_table: + id: 1801 + title: Aggregate Stats (All channels) + header: + metric: Metric + avg: Avg + std dev: Std Dev + min: Min + max: Max + unit: Unit + metric: + gfx90a: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * + TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 + * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 + * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 + * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 + * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 + * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 + * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 + * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + 
(TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 + * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * + TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31]))
/ + ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16] + + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18])) + + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21] + + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23])) + + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26] + + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28])) + + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31] + + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) +
(TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * + TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 + * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 + * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 + * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 + * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 + * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 + * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 + * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if
(((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * + TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 + * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 + * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 + * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 + * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 + * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 + * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 + * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + +
(TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) !=
0) else None)) + unit: pct + gfx941: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * + TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + 
TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + 
(TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + unit: pct + gfx940: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * 
TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * + TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + 
(TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + 
TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + unit: pct + gfx942: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * 
TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * + TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + 
(TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + 
TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + unit: pct + gfx950: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) 
+ (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 + * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * + TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + 
(TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + 
TCC_HIT[15])) != 0) else None) + max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 + * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * + TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) + if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + unit: pct + gfx908: + L2 Cache Hit Rate: + avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * + TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 + * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 + * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 + * TCC_HIT[17])) + (100 * 
TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 + * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 + * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 + * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 + * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + +
(TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 + * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * + TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + + (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + + (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + + (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + + (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + + (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / + ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16] + + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18])) + + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21] + + TCC_HIT[21])) 
+ (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23])) + + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26] + + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28])) + + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31] + + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * + TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 + * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 + * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 + * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 + * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 + *
TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 + * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 + * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) +
(TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * + TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 + * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 + * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 + * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 + * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 + * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 + * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 + * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) +
(TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) + unit: pct + - metric_table: + id: 1802 + title: L2 Cache Hit Rate (pct) + header: + metric: Channel + expr: Expression + metric: + gfx90a: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + expr: (((100 *
TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] + + TCC_MISS[::_1]) != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1803 + title: L2 Requests (per normUnit) + header: + metric: Channel + expr: Expression + metric: + gfx90a: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + expr: (TO_INT(TCC_REQ[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1804 + title: L2 Requests (per normUnit) + header: + metric: Channel + read req: L2 Read + write req: L2 Write + atomic req: L2 Atomic + metric: + gfx90a: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + read req: 
AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1805 + title: L2-Fabric Requests (per normUnit) + header: + metric: Channel + read req: L2-Fabric Read + write req: L2-Fabric Write and Atomic + atomic req: L2-Fabric Atomic + metric: + gfx90a: + ::_1: + read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: 
AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) + write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) + atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1806 + title: L2-Fabric Read Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + gfx90a: + ::_1: + expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1807 + title: L2-Fabric Write and Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + gfx90a: + ::_1: + expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if 
(TCC_EA_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1] + != 0) else None) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1808 + title: L2-Fabric Atomic Latency (Cycles) + header: + metric: Channel + expr: Expression + metric: + gfx90a: + ::_1: + expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] + != 0) else 0) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1] + != 
0) else 0) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + - metric_table: + id: 1809 + title: L2-Fabric Read Stall (Cycles per normUnit) + header: + metric: Channel + ea read stall - pcie: L2-Fabric Read Stall (PCIe) + ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" + ea read stall - hbm: L2-Fabric Read Stall (HBM) + metric: + gfx90a: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) + / $denom)) + ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) + / $denom)) + ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + ea read stall - pcie: None + ea read stall - if: None + ea read stall - hbm: None + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1810 + title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) + header: + metric: Channel + ea write stall - pcie: L2-Fabric Write Stall (PCIe) + ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" + ea write stall - hbm: L2-Fabric Write Stall (HBM) + ea write stall - starve: L2-Fabric Write Starve + metric: + gfx90a: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: 
AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) + / $denom)) + ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) + / $denom)) + ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) + / $denom)) + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + ea write stall - pcie: None + ea write stall - if: None + ea write stall - hbm: None + ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) + / $denom)) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_multiple_bar + tui_style: simple_multiple_bar + - metric_table: + id: 1812 + title: L2-Fabric (128B read requests per normUnit) + header: + metric: Channel + expr: Expression + metric: + gfx90a: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx941: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx940: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx942: + 
::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx950: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + gfx908: + ::_1: + expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) + placeholder_range: + ::_1: $total_l2_chan + cli_style: simple_box + tui_style: simple_box + metrics_description: + L2 Cache Hit Rate: + plain: The percentage of the total number of requests to the L2 from all clients that + hit in the cache. As noted in the Speed-of-Light section, this includes hit-on-miss + requests. + rst: The percentage of the total number of requests to the L2 from all clients that hit in the cache. + As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss + requests. + unit: Percent +- id: 2100 + title: PC Sampling + data source: + - pc_sampling_table: + id: 2101 + title: PC Sampling + source: ps_file + comparable: false